Commit c24d9223 authored by Ronald S. Bultje's avatar Ronald S. Bultje

Add averaging-SAD functions for 8-point comp-inter motion search.

Makes first 50 frames of bus @ 1500kbps encode from 3min22.7 to 3min18.2,
i.e. 2.3% faster. In addition, use the sub_pixel_avg functions to calc
the variance of the averaging predictor. This is slightly suboptimal
because the function is subpixel-position-aware, but it will (at least
for the SSE2 version) not actually use a bilinear filter for a full-pixel
position, thus leading to approximately the same performance compared to
if we implemented an actual average-aware full-pixel variance function.
That gains another 0.3 seconds (i.e. encode time goes to 3min17.4), thus
leading to a total gain of 2.7%.

Change-Id: I3f059d2b04243921868cfed2568d4fa65d7b5acd
parent 9d959931
......@@ -369,7 +369,6 @@ specialize vp9_sad8x16 mmx sse2
prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x8 mmx sse2
# TODO(jingning): need to convert these functions into mmx/sse2 form
prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad8x4 sse2
......@@ -379,6 +378,45 @@ specialize vp9_sad4x8 sse
prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad4x4 mmx sse
prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad64x64_avg sse2
prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad32x64_avg sse2
prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad64x32_avg sse2
prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad32x16_avg sse2
prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad16x32_avg sse2
prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad32x32_avg sse2
prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad16x16_avg sse2
prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad16x8_avg sse2
prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad8x16_avg sse2
prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad8x8_avg sse2
prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad8x4_avg sse2
prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad4x8_avg sse
prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
specialize vp9_sad4x4_avg sse
prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_h sse2
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
......
......@@ -2353,16 +2353,12 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
/* Compound pred buffer */
DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
/* Get compound pred by averaging two pred blocks. */
comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
second_pred, 0x7fffffff) +
mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
......@@ -2380,9 +2376,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
best_address;
/* Get compound block and use it to calculate SAD. */
comp_avg_pred(comp_pred, second_pred, w, h, check_here,
in_what_stride);
thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
second_pred, bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
......@@ -2412,10 +2407,11 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
this_mv.as_mv.col = ref_mv->as_mv.col << 3;
if (bestsad < INT_MAX) {
int besterr;
comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
(unsigned int *)(&thissad)) +
// FIXME(rbultje, yunqing): add full-pixel averaging variance functions
// so we don't have to use the subpixel with xoff=0,yoff=0 here.
int besterr = fn_ptr->svaf(best_address, in_what_stride, 0, 0,
what, what_stride, (unsigned int *)(&thissad),
second_pred) +
mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
xd->allow_high_precision_mv);
return besterr;
......
......@@ -1472,8 +1472,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
for (i = 0; i < MAX_MODES; i++)
cpi->rd_thresh_mult[i] = 128;
#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
SDX3F, SDX8F, SDX4DF)\
cpi->fn_ptr[BT].sdf = SDF; \
cpi->fn_ptr[BT].sdaf = SDAF; \
cpi->fn_ptr[BT].vf = VF; \
cpi->fn_ptr[BT].svf = SVF; \
cpi->fn_ptr[BT].svaf = SVAF; \
......@@ -1484,67 +1486,80 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->fn_ptr[BT].sdx8f = SDX8F; \
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
vp9_variance32x16, vp9_sub_pixel_variance32x16,
vp9_sub_pixel_avg_variance32x16, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x16x4d)
BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
vp9_variance16x32, vp9_sub_pixel_variance16x32,
vp9_sub_pixel_avg_variance16x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad16x32x4d)
BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
vp9_variance64x32, vp9_sub_pixel_variance64x32,
vp9_sub_pixel_avg_variance64x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad64x32x4d)
BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
vp9_variance32x64, vp9_sub_pixel_variance32x64,
vp9_sub_pixel_avg_variance32x64, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x64x4d)
BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
vp9_variance32x32, vp9_sub_pixel_variance32x32,
vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
vp9_variance_halfpixvar32x32_v,
vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
vp9_sad32x32x4d)
BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
vp9_variance64x64, vp9_sub_pixel_variance64x64,
vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
vp9_variance_halfpixvar64x64_v,
vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
vp9_sad64x64x4d)
BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
vp9_variance16x16, vp9_sub_pixel_variance16x16,
vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
vp9_variance_halfpixvar16x16_v,
vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
vp9_sad16x16x4d)
BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
vp9_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
vp9_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
vp9_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4,
BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
vp9_variance8x4, vp9_sub_pixel_variance8x4,
vp9_sub_pixel_avg_variance8x4, NULL, NULL,
NULL, NULL, vp9_sad8x4x8,
vp9_sad8x4x4d)
BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8,
BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
vp9_variance4x8, vp9_sub_pixel_variance4x8,
vp9_sub_pixel_avg_variance4x8, NULL, NULL,
NULL, NULL, vp9_sad4x8x8,
vp9_sad4x8x4d)
BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
vp9_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
......
......@@ -11,25 +11,43 @@
#include <stdlib.h>
#include "vp9/common/vp9_sadmxn.h"
#include "vp9/encoder/vp9_variance.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "./vp9_rtcd.h"
unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
}
unsigned int vp9_sad64x32_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 32);
}
/* Expands to the C reference implementations of the two SAD flavors for an
 * m x n block:
 *   vp9_sadMxN_c     - plain sum of absolute differences between src and ref
 *   vp9_sadMxN_avg_c - SAD between src and the per-pixel average of ref and
 *                      a second (compound) predictor block
 * max_sad is part of the public signature but is not used for early
 * termination in these C versions. */
#define sad_mxn_func(m, n) \
unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride, \
unsigned int max_sad) { \
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
} \
unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride, \
const uint8_t *second_pred, \
unsigned int max_sad) { \
uint8_t comp_pred[m * n]; /* averaged predictor, packed with stride m */ \
comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \
}
sad_mxn_func(64, 64)
sad_mxn_func(64, 32)
sad_mxn_func(32, 64)
sad_mxn_func(32, 32)
sad_mxn_func(32, 16)
sad_mxn_func(16, 32)
sad_mxn_func(16, 16)
sad_mxn_func(16, 8)
sad_mxn_func(8, 16)
sad_mxn_func(8, 8)
sad_mxn_func(8, 4)
sad_mxn_func(4, 8)
sad_mxn_func(4, 4)
void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
int src_stride,
......@@ -46,14 +64,6 @@ void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
unsigned int vp9_sad32x64_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 64);
}
void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
......@@ -69,22 +79,6 @@ void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
}
unsigned int vp9_sad32x16_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 16);
}
void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
......@@ -100,14 +94,6 @@ void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
unsigned int vp9_sad16x32_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 32);
}
void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t* const ref_ptr[],
......@@ -123,63 +109,6 @@ void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
ref_ptr[3], ref_stride, 0x7fffffff);
}
unsigned int vp9_sad16x16_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
}
unsigned int vp9_sad8x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
}
unsigned int vp9_sad16x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
}
unsigned int vp9_sad8x16_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
}
unsigned int vp9_sad8x4_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
}
unsigned int vp9_sad4x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8);
}
unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
unsigned int max_sad) {
return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
}
void vp9_sad64x64x3_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
......
......@@ -20,6 +20,13 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int ref_stride,
unsigned int max_sad);
/* Averaging-SAD function pointer: computes the SAD between src and the
 * per-pixel average of ref and second_pred (the compound-inter predictor).
 * max_sad may be used by implementations as an early-exit threshold. */
typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
const uint8_t *second_pred,
unsigned int max_sad);
typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
......@@ -74,20 +81,21 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
int ref_stride);
typedef struct vp9_variance_vtable {
vp9_sad_fn_t sdf;
vp9_variance_fn_t vf;
vp9_subpixvariance_fn_t svf;
vp9_subp_avg_variance_fn_t svaf;
vp9_variance_fn_t svf_halfpix_h;
vp9_variance_fn_t svf_halfpix_v;
vp9_variance_fn_t svf_halfpix_hv;
vp9_sad_multi_fn_t sdx3f;
vp9_sad_multi1_fn_t sdx8f;
vp9_sad_multi_d_fn_t sdx4df;
vp9_sad_fn_t sdf;
vp9_sad_avg_fn_t sdaf;
vp9_variance_fn_t vf;
vp9_subpixvariance_fn_t svf;
vp9_subp_avg_variance_fn_t svaf;
vp9_variance_fn_t svf_halfpix_h;
vp9_variance_fn_t svf_halfpix_v;
vp9_variance_fn_t svf_halfpix_hv;
vp9_sad_multi_fn_t sdx3f;
vp9_sad_multi1_fn_t sdx8f;
vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, uint8_t *ref, int ref_stride) {
int height, const uint8_t *ref, int ref_stride) {
int i, j;
for (i = 0; i < height; i++) {
......
......@@ -12,12 +12,42 @@
SECTION .text
; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD64XN 1
cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
; SAD_FN: shared prologue for the SADs.
;   %1 = block width, %2 = block height
;   %3 = GPR count requested from cglobal (5 for wide blocks, 7 when the
;        body also needs stride*3 helper registers)
;   %4 = 0 for plain SAD, 1 for the averaging (_avg) variant that takes a
;        second_pred argument
; Declares the sad%1x%2[_avg] symbol, sign-extends the stride arguments,
; and for the 7-register form precomputes src_stride*3 / ref_stride*3.
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
ref, ref_stride, \
second_pred, \
src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d ; spare GPR available on x86-64
%else ; x86-32
; NOTE(review): no free GPR on x86-32 here, so the row counter lives in
; the first argument's stack slot (r0m) — src has already been loaded.
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
movsxdifnidn src_strideq, src_strided ; sign-extend int stride on x86-64
movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
lea src_stride3q, [src_strideq*3] ; stride*3 for 4-rows-per-iter loops
lea ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro
; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
SAD_FN 64, %1, 5, %2
mov n_rowsd, %1
pxor m0, m0
.loop:
......@@ -25,6 +55,13 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
movu m2, [refq+16]
movu m3, [refq+32]
movu m4, [refq+48]
%if %2 == 1
pavgb m1, [second_predq+mmsize*0]
pavgb m2, [second_predq+mmsize*1]
pavgb m3, [second_predq+mmsize*2]
pavgb m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
psadbw m1, [srcq]
psadbw m2, [srcq+16]
psadbw m3, [srcq+32]
......@@ -47,21 +84,27 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2
; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD32XN 1
cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
%macro SAD32XN 1-2 0
SAD_FN 32, %1, 5, %2
mov n_rowsd, %1/2
pxor m0, m0
.loop:
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+ref_strideq]
movu m4, [refq+ref_strideq+16]
%if %2 == 1
pavgb m1, [second_predq+mmsize*0]
pavgb m2, [second_predq+mmsize*1]
pavgb m3, [second_predq+mmsize*2]
pavgb m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
psadbw m1, [srcq]
psadbw m2, [srcq+16]
psadbw m3, [srcq+src_strideq]
......@@ -85,16 +128,14 @@ INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD16XN 1
cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
lea src_stride3q, [src_strideq*3]
lea ref_stride3q, [ref_strideq*3]
%macro SAD16XN 1-2 0
SAD_FN 16, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
......@@ -103,6 +144,13 @@ cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
movu m2, [refq+ref_strideq]
movu m3, [refq+ref_strideq*2]
movu m4, [refq+ref_stride3q]
%if %2 == 1
pavgb m1, [second_predq+mmsize*0]
pavgb m2, [second_predq+mmsize*1]
pavgb m3, [second_predq+mmsize*2]
pavgb m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
psadbw m1, [srcq]
psadbw m2, [srcq+src_strideq]
psadbw m3, [srcq+src_strideq*2]
......@@ -126,16 +174,14 @@ INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2
; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD8XN 1
cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
lea src_stride3q, [src_strideq*3]
lea ref_stride3q, [ref_strideq*3]
%macro SAD8XN 1-2 0
SAD_FN 8, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
......@@ -144,6 +190,11 @@ cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
movhps m1, [refq+ref_strideq]
movh m2, [refq+ref_strideq*2]
movhps m2, [refq+ref_stride3q]
%if %2 == 1
pavgb m1, [second_predq+mmsize*0]
pavgb m2, [second_predq+mmsize*1]
lea second_predq, [second_predq+mmsize*2]
%endif
movh m3, [srcq]
movhps m3, [srcq+src_strideq]
movh m4, [srcq+src_strideq*2]
......@@ -167,16 +218,14 @@ INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2
; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD4XN 1
cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
lea src_stride3q, [src_strideq*3]
lea ref_stride3q, [ref_strideq*3]
%macro SAD4XN 1-2 0
SAD_FN 4, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
......@@ -187,6 +236,11 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
movd m4, [refq+ref_stride3q]
punpckldq m1, m2
punpckldq m3, m4
%if %2 == 1
pavgb m1, [second_predq+mmsize*0]
pavgb m3, [second_predq+mmsize*1]
lea second_predq, [second_predq+mmsize*2]
%endif
movd m2, [srcq]
movd m5, [srcq+src_strideq]
movd m4, [srcq+src_strideq*2]
......@@ -209,3 +263,5 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
INIT_MMX sse
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment