Commit 71ecb5d7 authored by Yunqing Wang's avatar Yunqing Wang
Browse files

Full search SAD function optimization in SSE4.1

Use mpsadbw, and calculate 8 sad at once. Function list:
vp8_sad16x16x8_sse4
vp8_sad16x8x8_sse4
vp8_sad8x16x8_sse4
vp8_sad8x8x8_sse4
vp8_sad4x4x8_sse4

(test clip: tulip)
For best quality mode, this gave encoder a 5% performance boost.
For good quality mode with speed=1, this gave encoder a 3%
performance boost.

Change-Id: I083b5a39d39144f88dcbccbef95da6498e490134
parent a0ae3682
......@@ -824,6 +824,7 @@ process_common_toolchain() {
soft_enable sse2
soft_enable sse3
soft_enable ssse3
soft_enable sse4_1
case ${tgt_os} in
win*)
......
......@@ -199,6 +199,7 @@ ARCH_EXT_LIST="
sse2
sse3
ssse3
sse4_1
altivec
"
......
......@@ -40,6 +40,12 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c;
cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c;
cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_c;
cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_c;
cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_c;
cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_c;
cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_c;
cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c;
cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c;
cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c;
......
......@@ -1323,7 +1323,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
check_here = r * mv_stride + in_what + col_min;
c = col_min;
while ((c + 3) < col_max)
while ((c + 2) < col_max)
{
int i;
......@@ -1388,6 +1388,158 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
#endif
int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
{
unsigned char *what = (*(b->base_src) + b->src);
int what_stride = b->src_stride;
unsigned char *in_what;
int in_what_stride = d->pre_stride;
int mv_stride = d->pre_stride;
unsigned char *bestaddress;
MV *best_mv = &d->bmi.mv.as_mv;
MV this_mv;
int bestsad = INT_MAX;
int r, c;
unsigned char *check_here;
unsigned int thissad;
int ref_row = ref_mv->row >> 3;
int ref_col = ref_mv->col >> 3;
int row_min = ref_row - distance;
int row_max = ref_row + distance;
int col_min = ref_col - distance;
int col_max = ref_col + distance;
unsigned short sad_array8[8];
unsigned int sad_array[3];
// Work out the mid point for the search
in_what = *(d->base_pre) + d->pre;
bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
best_mv->row = ref_row;
best_mv->col = ref_col;
// We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
}
// Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
if (col_min < x->mv_col_min)
col_min = x->mv_col_min;
if (col_max > x->mv_col_max)
col_max = x->mv_col_max;
if (row_min < x->mv_row_min)
row_min = x->mv_row_min;
if (row_max > x->mv_row_max)
row_max = x->mv_row_max;
for (r = row_min; r < row_max ; r++)
{
this_mv.row = r << 3;
check_here = r * mv_stride + in_what + col_min;
c = col_min;
while ((c + 7) < col_max)
{
int i;
fn_ptr->sdx8f(what, what_stride, check_here , in_what_stride, sad_array8);
for (i = 0; i < 8; i++)
{
thissad = (unsigned int)sad_array8[i];
if (thissad < bestsad)
{
this_mv.col = c << 3;
thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
bestsad = thissad;
best_mv->row = r;
best_mv->col = c;
bestaddress = check_here;
}
}
check_here++;
c++;
}
}
while ((c + 2) < col_max)
{
int i;
fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
for (i = 0; i < 3; i++)
{
thissad = sad_array[i];
if (thissad < bestsad)
{
this_mv.col = c << 3;
thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
bestsad = thissad;
best_mv->row = r;
best_mv->col = c;
bestaddress = check_here;
}
}
check_here++;
c++;
}
}
while (c < col_max)
{
thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
if (thissad < bestsad)
{
this_mv.col = c << 3;
thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
bestsad = thissad;
best_mv->row = r;
best_mv->col = c;
bestaddress = check_here;
}
}
check_here ++;
c ++;
}
}
this_mv.row = best_mv->row << 3;
this_mv.col = best_mv->col << 3;
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+ vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
else
return INT_MAX;
}
#ifdef ENTROPY_STATS
void print_mode_context(void)
{
......
......@@ -93,6 +93,7 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
typedef prototype_full_search_sad(*vp8_full_search_fn_t);
extern prototype_full_search_sad(vp8_full_search_sad);
extern prototype_full_search_sad(vp8_full_search_sadx3);
extern prototype_full_search_sad(vp8_full_search_sadx8);
typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
extern prototype_diamond_search_sad(vp8_diamond_search_sad);
......
......@@ -2341,6 +2341,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_v);
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_hv);
cpi->fn_ptr[BLOCK_16X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
cpi->fn_ptr[BLOCK_16X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x8);
cpi->fn_ptr[BLOCK_16X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);
cpi->fn_ptr[BLOCK_16X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
......@@ -2350,6 +2351,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_16X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
cpi->fn_ptr[BLOCK_16X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x8);
cpi->fn_ptr[BLOCK_16X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);
cpi->fn_ptr[BLOCK_8X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
......@@ -2359,6 +2361,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
cpi->fn_ptr[BLOCK_8X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x8);
cpi->fn_ptr[BLOCK_8X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);
cpi->fn_ptr[BLOCK_8X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
......@@ -2368,6 +2371,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_8X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
cpi->fn_ptr[BLOCK_8X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x8);
cpi->fn_ptr[BLOCK_8X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);
cpi->fn_ptr[BLOCK_4X4].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
......@@ -2377,6 +2381,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
cpi->fn_ptr[BLOCK_4X4].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
cpi->fn_ptr[BLOCK_4X4].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8);
cpi->fn_ptr[BLOCK_4X4].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
#if !(CONFIG_REALTIME_ONLY)
......
......@@ -126,6 +126,24 @@ void vp8_sad16x16x3_c(
sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
void vp8_sad16x16x8_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned short *sad_array
)
{
sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
}
void vp8_sad16x8x3_c(
const unsigned char *src_ptr,
int src_stride,
......@@ -139,6 +157,24 @@ void vp8_sad16x8x3_c(
sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
void vp8_sad16x8x8_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned short *sad_array
)
{
sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
}
void vp8_sad8x8x3_c(
const unsigned char *src_ptr,
int src_stride,
......@@ -152,6 +188,24 @@ void vp8_sad8x8x3_c(
sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
void vp8_sad8x8x8_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned short *sad_array
)
{
sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
}
void vp8_sad8x16x3_c(
const unsigned char *src_ptr,
int src_stride,
......@@ -165,6 +219,24 @@ void vp8_sad8x16x3_c(
sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
void vp8_sad8x16x8_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned short *sad_array
)
{
sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
}
void vp8_sad4x4x3_c(
const unsigned char *src_ptr,
int src_stride,
......@@ -178,6 +250,24 @@ void vp8_sad4x4x3_c(
sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
}
void vp8_sad4x4x8_c(
const unsigned char *src_ptr,
int src_stride,
const unsigned char *ref_ptr,
int ref_stride,
unsigned short *sad_array
)
{
sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff);
sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
}
void vp8_sad16x16x4d_c(
const unsigned char *src_ptr,
int src_stride,
......
......@@ -32,6 +32,16 @@
unsigned int *sad_array\
)
#define prototype_sad_multi_same_address_1(sym)\
void (sym)\
(\
const unsigned char *src_ptr, \
int source_stride, \
const unsigned char *ref_ptr, \
int ref_stride, \
unsigned short *sad_array\
)
#define prototype_sad_multi_dif_address(sym)\
void (sym)\
(\
......@@ -138,6 +148,31 @@ extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3);
#endif
extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);
#ifndef vp8_variance_sad16x16x8
#define vp8_variance_sad16x16x8 vp8_sad16x16x8_c
#endif
extern prototype_sad_multi_same_address_1(vp8_variance_sad16x16x8);
#ifndef vp8_variance_sad16x8x8
#define vp8_variance_sad16x8x8 vp8_sad16x8x8_c
#endif
extern prototype_sad_multi_same_address_1(vp8_variance_sad16x8x8);
#ifndef vp8_variance_sad8x8x8
#define vp8_variance_sad8x8x8 vp8_sad8x8x8_c
#endif
extern prototype_sad_multi_same_address_1(vp8_variance_sad8x8x8);
#ifndef vp8_variance_sad8x16x8
#define vp8_variance_sad8x16x8 vp8_sad8x16x8_c
#endif
extern prototype_sad_multi_same_address_1(vp8_variance_sad8x16x8);
#ifndef vp8_variance_sad4x4x8
#define vp8_variance_sad4x4x8 vp8_sad4x4x8_c
#endif
extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8);
//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
#ifndef vp8_variance_sad16x16x4d
......@@ -274,6 +309,7 @@ extern prototype_sad(vp8_variance_get4x4sse_cs);
typedef prototype_sad(*vp8_sad_fn_t);
typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t);
typedef prototype_variance(*vp8_variance_fn_t);
typedef prototype_variance2(*vp8_variance2_fn_t);
......@@ -317,6 +353,12 @@ typedef struct
vp8_sad_multi_fn_t sad8x8x3;
vp8_sad_multi_fn_t sad4x4x3;
vp8_sad_multi1_fn_t sad16x16x8;
vp8_sad_multi1_fn_t sad16x8x8;
vp8_sad_multi1_fn_t sad8x16x8;
vp8_sad_multi1_fn_t sad8x8x8;
vp8_sad_multi1_fn_t sad4x4x8;
vp8_sad_multi_d_fn_t sad16x16x4d;
vp8_sad_multi_d_fn_t sad16x8x4d;
vp8_sad_multi_d_fn_t sad8x16x4d;
......@@ -334,6 +376,7 @@ typedef struct
vp8_variance_fn_t svf_halfpix_v;
vp8_variance_fn_t svf_halfpix_hv;
vp8_sad_multi_fn_t sdx3f;
vp8_sad_multi1_fn_t sdx8f;
vp8_sad_multi_d_fn_t sdx4df;
} vp8_variance_fn_ptr_t;
......
......@@ -24,5 +24,14 @@
#endif
#endif
#if HAVE_SSE4_1
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_search_full_search
#define vp8_search_full_search vp8_full_search_sadx8
#endif
#endif
#endif
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%macro PROCESS_16X2X8 1
%if %1
movdqa xmm0, XMMWORD PTR [rsi]
movq xmm1, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
movq xmm2, MMWORD PTR [rdi+16]
punpcklqdq xmm1, xmm3
punpcklqdq xmm3, xmm2
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
psrldq xmm0, 8
movdqa xmm4, xmm3
mpsadbw xmm3, xmm0, 0x0
mpsadbw xmm4, xmm0, 0x5
paddw xmm1, xmm2
paddw xmm1, xmm3
paddw xmm1, xmm4
%else
movdqa xmm0, XMMWORD PTR [rsi]
movq xmm5, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
movq xmm2, MMWORD PTR [rdi+16]
punpcklqdq xmm5, xmm3
punpcklqdq xmm3, xmm2
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
psrldq xmm0, 8
movdqa xmm4, xmm3
mpsadbw xmm3, xmm0, 0x0
mpsadbw xmm4, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm5, xmm3
paddw xmm5, xmm4
paddw xmm1, xmm5
%endif
movdqa xmm0, XMMWORD PTR [rsi + rax]
movq xmm5, MMWORD PTR [rdi+ rdx]
movq xmm3, MMWORD PTR [rdi+ rdx+8]
movq xmm2, MMWORD PTR [rdi+ rdx+16]
punpcklqdq xmm5, xmm3
punpcklqdq xmm3, xmm2
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
psrldq xmm0, 8
movdqa xmm4, xmm3
mpsadbw xmm3, xmm0, 0x0
mpsadbw xmm4, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm5, xmm3
paddw xmm5, xmm4
paddw xmm1, xmm5
%endmacro
%macro PROCESS_8X2X8 1
%if %1
movq xmm0, MMWORD PTR [rsi]
movq xmm1, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm1, xmm3
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
paddw xmm1, xmm2
%else
movq xmm0, MMWORD PTR [rsi]
movq xmm5, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm5, xmm3
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm1, xmm5
%endif
movq xmm0, MMWORD PTR [rsi + rax]
movq xmm5, MMWORD PTR [rdi+ rdx]
movq xmm3, MMWORD PTR [rdi+ rdx+8]
punpcklqdq xmm5, xmm3
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
movdqa xmm2, xmm5
mpsadbw xmm5, xmm0, 0x0
mpsadbw xmm2, xmm0, 0x5
paddw xmm5, xmm2
paddw xmm1, xmm5
%endmacro
%macro PROCESS_4X2X8 1
%if %1
movd xmm0, [rsi]
movq xmm1, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm1, xmm3
mpsadbw xmm1, xmm0, 0x0
%else
movd xmm0, [rsi]
movq xmm5, MMWORD PTR [rdi]
movq xmm3, MMWORD PTR [rdi+8]
punpcklqdq xmm5, xmm3