Commit 829179e8 authored by Yunqing Wang, committed by Code Review

Merge "Preload reference area to an intermediate buffer in sub-pixel motion search"

parents 52d13777 20bd1446
@@ -266,6 +266,14 @@ typedef struct
int corrupted;
#if ARCH_X86 || ARCH_X86_64
/* This is an intermediate buffer currently used in sub-pixel motion search
 * to keep a copy of the reference area. The buffer can also be reused for
 * other purposes.
 */
DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
#endif
#if CONFIG_RUNTIME_CPU_DETECT
struct VP8_COMMON_RTCD *rtcd;
#endif
......
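For context, DECLARE_ALIGNED is the alignment macro from vpx_ports/mem.h. A simplified sketch of what the declaration above expands to on GCC-style compilers (the real macro also handles MSVC's __declspec(align(n)) form):

/* Simplified sketch, assuming a GCC-style compiler; vpx_ports/mem.h also
 * provides an MSVC variant. */
#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))

/* 22 rows x 32 columns; the 32-byte alignment plus the 32-byte row stride
 * keep every row of the copy on its own cache-line boundary. */
DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]);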
@@ -47,7 +47,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c;
cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_c;
cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_c;
#if ARCH_X86 || ARCH_X86_64
cpi->rtcd.variance.copy32xn = vp8_copy32xn_c;
#endif
cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
cpi->rtcd.variance.var8x8 = vp8_variance8x8_c;
cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
......
@@ -11,7 +11,7 @@
#include "mcomp.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/config.h"
#include <stdio.h>
#include <limits.h>
#include <math.h>
@@ -165,19 +165,25 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
x->searches_per_step = 8;
}
/*
 * To avoid the penalty of cache-line-crossing reads, preload the reference
 * area into a small buffer whose alignment guarantees that reads from it
 * never cross a cache line. This reduces the CPU cycles spent reading
 * reference data in the sub-pixel filter functions.
 * TODO: Since the sub-pixel search range here is -3 ~ 3, copying a
 * 22-row x 32-column area is enough for a 16x16 macroblock. Later, for
 * SPLITMV, the copied area could be reduced.
 */
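A quick sanity check of the 22x32 figure (illustrative only, not part of the commit): the search probes at most 3 pels on each side of a 16x16 macroblock, and the row stride is padded to 32 bytes for alignment.

#include <assert.h>

enum
{
    MB_ROWS    = 16,                        /* macroblock height */
    SEARCH_PAD = 3,                         /* sub-pixel search range is -3 ~ 3 */
    BUF_ROWS   = MB_ROWS + 2 * SEARCH_PAD,  /* 16 + 3 + 3 = 22 */
    BUF_STRIDE = 32                         /* >= 16 + 3 + 3 = 22, padded to 32 */
};

int main(void)
{
    assert(BUF_ROWS * BUF_STRIDE == 22 * 32);   /* matches y_buf[22*32] */
    return 0;
}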
#define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
-#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to predictor base of a motion vector
+#define PRE(r,c) (y + (((r)>>2) * y_stride + ((c)>>2) -(offset))) // pointer to predictor base of a motion vector
#define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc
-#define DIST(r,c) vfp->svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
+#define DIST(r,c) vfp->svf( PRE(r,c), y_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
#define MIN(x,y) (((x)<(y))?(x):(y))
#define MAX(x,y) (((x)>(y))?(x):(y))
//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
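To make the macro arithmetic concrete: r and c are in quarter-pel units, PRE() keeps the integer-pel part and SP() rescales the fractional part to the eighth-pel offset that the sub-pixel variance function (svf) expects. A small stand-alone demo (not part of the commit; like the codebase, it relies on arithmetic right shift of negative integers):

#include <stdio.h>

int main(void)
{
    int c;
    for (c = -3; c <= 3; c++)
        printf("quarter-pel %2d -> integer part %2d, svf offset %d\n",
               c, c >> 2, (c & 3) << 1);  /* e.g. -3 -> pel -1, offset 2 */
    return 0;
}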
int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
@@ -185,7 +191,6 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int *mvcost[2], int *distortion,
unsigned int *sse1)
{
-unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
unsigned char *z = (*(b->base_src) + b->src);
int rr = ref_mv->as_mv.row >> 1, rc = ref_mv->as_mv.col >> 1;
@@ -204,12 +209,38 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
int minr = MAX(x->mv_row_min << 2, (ref_mv->as_mv.row >> 1) - ((1 << mvlong_width) - 1));
int maxr = MIN(x->mv_row_max << 2, (ref_mv->as_mv.row >> 1) + ((1 << mvlong_width) - 1));
int y_stride;
int offset;
#if ARCH_X86 || ARCH_X86_64
MACROBLOCKD *xd = &x->e_mbd;
unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
unsigned char *y;
int buf_r1, buf_r2, buf_c1, buf_c2;
// Clamping to avoid out-of-range data access
buf_r1 = ((bestmv->as_mv.row - 3) < x->mv_row_min)?(bestmv->as_mv.row - x->mv_row_min):3;
buf_r2 = ((bestmv->as_mv.row + 3) > x->mv_row_max)?(x->mv_row_max - bestmv->as_mv.row):3;
buf_c1 = ((bestmv->as_mv.col - 3) < x->mv_col_min)?(bestmv->as_mv.col - x->mv_col_min):3;
buf_c2 = ((bestmv->as_mv.col + 3) > x->mv_col_max)?(x->mv_col_max - bestmv->as_mv.col):3;
y_stride = 32;
/* Copy to intermediate buffer before searching. */
vfp->copymem(y0 - buf_c1 - d->pre_stride*buf_r1, d->pre_stride, xd->y_buf, y_stride, 16+buf_r1+buf_r2);
y = xd->y_buf + y_stride*buf_r1 +buf_c1;
#else
unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
y_stride = d->pre_stride;
#endif
offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
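The four clamp expressions above share one pattern: extend up to 3 pels from the best MV toward each frame edge, but never past the valid range. A hypothetical helper (not in the commit) that captures it:

/* Hypothetical helper, equivalent to the four clamps above: how many pels
 * (at most 3) the copy may extend from pos toward limit. dir is -1 for the
 * lower limit, +1 for the upper limit. */
static int clamp_pad(int pos, int limit, int dir)
{
    int room = (limit - pos) * dir;   /* pels available in that direction */
    return room < 3 ? room : 3;
}

/* e.g. buf_r1 = clamp_pad(bestmv->as_mv.row, x->mv_row_min, -1);
 *      buf_r2 = clamp_pad(bestmv->as_mv.row, x->mv_row_max, +1); */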
// central mv
bestmv->as_mv.row <<= 3;
bestmv->as_mv.col <<= 3;
// calculate central point error
-besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1);
+besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
@@ -296,6 +327,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
#undef PRE
#undef SP
#undef DIST
#undef IFMVCV
#undef ERR
#undef CHECK_BETTER
#undef MIN
......
@@ -2078,6 +2078,14 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_4X4].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8);
cpi->fn_ptr[BLOCK_4X4].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
#if ARCH_X86 || ARCH_X86_64
cpi->fn_ptr[BLOCK_16X16].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn);
cpi->fn_ptr[BLOCK_16X8].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn);
cpi->fn_ptr[BLOCK_8X16].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn);
cpi->fn_ptr[BLOCK_8X8].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn);
cpi->fn_ptr[BLOCK_4X4].copymem = VARIANCE_INVOKE(&cpi->rtcd.variance, copy32xn);
#endif
cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search);
cpi->refining_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, refining_search);
......
@@ -10,6 +10,8 @@
#include <stdlib.h>
#include "vpx_ports/config.h"
#include "vpx/vpx_integer.h"
unsigned int vp8_sad16x16_c(
const unsigned char *src_ptr,
@@ -337,3 +339,64 @@ void vp8_sad4x4x4d_c(
sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
}
/* Copy a 32-pixel-wide area (two macroblock widths) by n rows to a buffer */
void vp8_copy32xn_c(
unsigned char *src_ptr,
int src_stride,
unsigned char *dst_ptr,
int dst_stride,
int height)
{
int r;
for (r = 0; r < height; r++)
{
#if !(CONFIG_FAST_UNALIGNED)
dst_ptr[0] = src_ptr[0];
dst_ptr[1] = src_ptr[1];
dst_ptr[2] = src_ptr[2];
dst_ptr[3] = src_ptr[3];
dst_ptr[4] = src_ptr[4];
dst_ptr[5] = src_ptr[5];
dst_ptr[6] = src_ptr[6];
dst_ptr[7] = src_ptr[7];
dst_ptr[8] = src_ptr[8];
dst_ptr[9] = src_ptr[9];
dst_ptr[10] = src_ptr[10];
dst_ptr[11] = src_ptr[11];
dst_ptr[12] = src_ptr[12];
dst_ptr[13] = src_ptr[13];
dst_ptr[14] = src_ptr[14];
dst_ptr[15] = src_ptr[15];
dst_ptr[16] = src_ptr[16];
dst_ptr[17] = src_ptr[17];
dst_ptr[18] = src_ptr[18];
dst_ptr[19] = src_ptr[19];
dst_ptr[20] = src_ptr[20];
dst_ptr[21] = src_ptr[21];
dst_ptr[22] = src_ptr[22];
dst_ptr[23] = src_ptr[23];
dst_ptr[24] = src_ptr[24];
dst_ptr[25] = src_ptr[25];
dst_ptr[26] = src_ptr[26];
dst_ptr[27] = src_ptr[27];
dst_ptr[28] = src_ptr[28];
dst_ptr[29] = src_ptr[29];
dst_ptr[30] = src_ptr[30];
dst_ptr[31] = src_ptr[31];
#else
((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0] ;
((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1] ;
((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2] ;
((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3] ;
((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4] ;
((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5] ;
((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6] ;
((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7] ;
#endif
src_ptr += src_stride;
dst_ptr += dst_stride;
}
}
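A hypothetical caller (simplified from the mcomp.c hunk above) showing how the copy is meant to be used: pull the clamped reference window into the 32-byte-aligned scratch buffer, then run the sub-pixel search against the copy. preload_ref and its parameter names are illustrative, not part of the commit.

void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride,
                    unsigned char *dst_ptr, int dst_stride, int height);

static void preload_ref(unsigned char *ref /* integer-pel best MV position */,
                        int ref_stride,
                        unsigned char *y_buf /* 22*32 bytes, 32-byte aligned */,
                        int pad_top, int pad_bottom, int pad_left)
{
    /* 16 macroblock rows plus the clamped padding rows above and below. */
    vp8_copy32xn_c(ref - pad_left - ref_stride * pad_top, ref_stride,
                   y_buf, 32, 16 + pad_top + pad_bottom);
}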
@@ -222,6 +222,13 @@ extern prototype_sad_multi_dif_address(vp8_variance_sad8x16x4d);
#endif
extern prototype_sad_multi_dif_address(vp8_variance_sad4x4x4d);
#if ARCH_X86 || ARCH_X86_64
#ifndef vp8_variance_copy32xn
#define vp8_variance_copy32xn vp8_copy32xn_c
#endif
extern prototype_sad(vp8_variance_copy32xn);
#endif
//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
#ifndef vp8_variance_var4x4
@@ -381,6 +388,10 @@ typedef struct
vp8_sad_multi_d_fn_t sad8x8x4d;
vp8_sad_multi_d_fn_t sad4x4x4d;
#if ARCH_X86 || ARCH_X86_64
vp8_sad_fn_t copy32xn;
#endif
#if CONFIG_INTERNAL_STATS
vp8_ssimpf_fn_t ssimpf_8x8;
vp8_ssimpf_fn_t ssimpf;
@@ -399,7 +410,9 @@ typedef struct
vp8_sad_multi_fn_t sdx3f;
vp8_sad_multi1_fn_t sdx8f;
vp8_sad_multi_d_fn_t sdx4df;
#if ARCH_X86 || ARCH_X86_64
vp8_sad_fn_t copymem;
#endif
} vp8_variance_fn_ptr_t;
#if CONFIG_RUNTIME_CPU_DETECT
......
@@ -328,3 +328,83 @@ x16x8sad_wmt_early_exit:
UNSHADOW_ARGS
pop rbp
ret
;void vp8_copy32xn_sse2(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *dst_ptr,
; int dst_stride,
; int height);
global sym(vp8_copy32xn_sse2)
sym(vp8_copy32xn_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;dst_ptr
movsxd rax, dword ptr arg(1) ;src_stride
movsxd rdx, dword ptr arg(3) ;dst_stride
movsxd rcx, dword ptr arg(4) ;height
block_copy_sse2_loopx4:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
movdqu xmm2, XMMWORD PTR [rsi + rax]
movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
lea rsi, [rsi+rax*2]
movdqu xmm4, XMMWORD PTR [rsi]
movdqu xmm5, XMMWORD PTR [rsi + 16]
movdqu xmm6, XMMWORD PTR [rsi + rax]
movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
lea rsi, [rsi+rax*2]
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi + 16], xmm1
movdqa XMMWORD PTR [rdi + rdx], xmm2
movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
lea rdi, [rdi+rdx*2]
movdqa XMMWORD PTR [rdi], xmm4
movdqa XMMWORD PTR [rdi + 16], xmm5
movdqa XMMWORD PTR [rdi + rdx], xmm6
movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
lea rdi, [rdi+rdx*2]
sub rcx, 4
cmp rcx, 4
jge block_copy_sse2_loopx4
cmp rcx, 0
je copy_is_done
block_copy_sse2_loop:
movdqu xmm0, XMMWORD PTR [rsi]
movdqu xmm1, XMMWORD PTR [rsi + 16]
lea rsi, [rsi+rax]
movdqa XMMWORD PTR [rdi], xmm0
movdqa XMMWORD PTR [rdi + 16], xmm1
lea rdi, [rdi+rdx]
sub rcx, 1
jne block_copy_sse2_loop
copy_is_done:
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
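Note the asymmetry in the loads and stores above: movdqu tolerates the arbitrarily aligned reference source, while the movdqa stores require 16-byte-aligned destination rows, which the 32-byte-aligned y_buf with its stride of 32 guarantees. A debug-build check along these lines (illustrative only, not part of the commit) would catch misuse:

#include <assert.h>
#include <stdint.h>

static void check_copy32xn_dst(const unsigned char *dst_ptr, int dst_stride)
{
    assert(((uintptr_t)dst_ptr & 15) == 0);  /* movdqa faults if unaligned */
    assert((dst_stride & 15) == 0);          /* keeps every row aligned */
}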
@@ -20,6 +20,7 @@
%define ret_var rbx
%define result_ptr arg(4)
%define max_err arg(4)
%define height dword ptr arg(4)
push rbp
mov rbp, rsp
push rsi
@@ -42,6 +43,7 @@
%define ret_var r11
%define result_ptr [rsp+xmm_stack_space+8+4*8]
%define max_err [rsp+xmm_stack_space+8+4*8]
%define height [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
%define src_stride rsi
@@ -51,6 +53,7 @@
%define ret_var r10
%define result_ptr r8
%define max_err r8
%define height r8
%endif
%endif
@@ -65,6 +68,7 @@
%define ret_var
%define result_ptr
%define max_err
%define height
%if ABI_IS_32BIT
pop rbx
@@ -632,6 +636,67 @@ sym(vp8_sad16x16_sse3):
STACK_FRAME_DESTROY_X3
;void vp8_copy32xn_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *dst_ptr,
; int dst_stride,
; int height);
global sym(vp8_copy32xn_sse3)
sym(vp8_copy32xn_sse3):
STACK_FRAME_CREATE_X3
block_copy_sse3_loopx4:
lea end_ptr, [src_ptr+src_stride*2]
movdqu xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
movdqu xmm4, XMMWORD PTR [end_ptr]
movdqu xmm5, XMMWORD PTR [end_ptr + 16]
movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
lea src_ptr, [src_ptr+src_stride*4]
lea end_ptr, [ref_ptr+ref_stride*2]
movdqa XMMWORD PTR [ref_ptr], xmm0
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
movdqa XMMWORD PTR [end_ptr], xmm4
movdqa XMMWORD PTR [end_ptr + 16], xmm5
movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
lea ref_ptr, [ref_ptr+ref_stride*4]
sub height, 4
cmp height, 4
jge block_copy_sse3_loopx4
;Check to see if there are more rows that need to be copied.
cmp height, 0
je copy_is_done
block_copy_sse3_loop:
movdqu xmm0, XMMWORD PTR [src_ptr]
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
lea src_ptr, [src_ptr+src_stride]
movdqa XMMWORD PTR [ref_ptr], xmm0
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
lea ref_ptr, [ref_ptr+ref_stride]
sub height, 1
jne block_copy_sse3_loop
copy_is_done:
STACK_FRAME_DESTROY_X3
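Note that the STACK_FRAME_CREATE_X3 helpers keep the SAD argument names, so ref_ptr and ref_stride inside vp8_copy32xn_sse3 actually address the destination buffer and its stride; the new height define added above simply aliases the fifth argument slot that result_ptr and max_err already share.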
;void vp8_sad16x16x4d_sse3(
; unsigned char *src_ptr,
; int src_stride,
@@ -892,3 +957,4 @@ sym(vp8_sad4x4x4d_sse3):
STACK_FRAME_DESTROY_X4
@@ -121,6 +121,7 @@ extern prototype_sad(vp8_sad8x8_wmt);
extern prototype_sad(vp8_sad8x16_wmt);
extern prototype_sad(vp8_sad16x8_wmt);
extern prototype_sad(vp8_sad16x16_wmt);
extern prototype_sad(vp8_copy32xn_sse2);
extern prototype_variance(vp8_variance4x4_wmt);
extern prototype_variance(vp8_variance8x8_wmt);
extern prototype_variance(vp8_variance8x16_wmt);
@@ -156,6 +157,9 @@ extern prototype_variance2(vp8_get16x16var_sse2);
#undef vp8_variance_sad16x16
#define vp8_variance_sad16x16 vp8_sad16x16_wmt
#undef vp8_variance_copy32xn
#define vp8_variance_copy32xn vp8_copy32xn_sse2
#undef vp8_variance_var4x4
#define vp8_variance_var4x4 vp8_variance4x4_wmt
@@ -222,6 +226,7 @@ extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3);
extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3);
extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3);
extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
extern prototype_sad(vp8_copy32xn_sse3);
#if !CONFIG_RUNTIME_CPU_DETECT
@@ -258,6 +263,9 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
#undef vp8_variance_sad4x4x4d
#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3
#undef vp8_variance_copy32xn
#define vp8_variance_copy32xn vp8_copy32xn_sse3
#endif
#endif
......
@@ -203,6 +203,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt;
cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt;
cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt;
cpi->rtcd.variance.copy32xn = vp8_copy32xn_sse2;
cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt;
cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt;
@@ -263,6 +264,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3;
cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3;
cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
cpi->rtcd.variance.copy32xn = vp8_copy32xn_sse3;
cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
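Putting the wiring together: the runtime-CPU-detect hunks above start every copy32xn pointer at the portable C version and upgrade it as SSE2, then SSE3, support is detected, and vp8_create_compressor mirrors the chosen pointer into each fn_ptr[].copymem so the sub-pixel search can call it through vfp. A simplified sketch of that dispatch pattern (the typedef and helper are hypothetical, not the real RTCD code):

typedef void (*copy32xn_fn_t)(unsigned char *src_ptr, int src_stride,
                              unsigned char *dst_ptr, int dst_stride,
                              int height);

void vp8_copy32xn_c(unsigned char *, int, unsigned char *, int, int);
void vp8_copy32xn_sse2(unsigned char *, int, unsigned char *, int, int);
void vp8_copy32xn_sse3(unsigned char *, int, unsigned char *, int, int);

static copy32xn_fn_t pick_copy32xn(int has_sse2, int has_sse3)
{
    copy32xn_fn_t fn = vp8_copy32xn_c;      /* portable fallback */
    if (has_sse2) fn = vp8_copy32xn_sse2;   /* unaligned loads, aligned stores */
    if (has_sse3) fn = vp8_copy32xn_sse3;
    return fn;
}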