Commit 7eee487c authored by Peter de Rivaz's avatar Peter de Rivaz Committed by Deb Mukherjee

Added highbitdepth sse2 SAD acceleration and tests

Change-Id: I1a74a1b032b198793ef9cc526327987f7799125f
(cherry picked from commit b1a6f6b9cb47eafe0ce86eaf0318612806091fe5)
parent 08d86bc9
This diff is collapsed.
......@@ -1652,37 +1652,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad64x64/;
specialize qw/vp9_highbd_sad64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x64/;
specialize qw/vp9_highbd_sad32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad64x32/;
specialize qw/vp9_highbd_sad64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x16/;
specialize qw/vp9_highbd_sad32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x32/;
specialize qw/vp9_highbd_sad16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x32/;
specialize qw/vp9_highbd_sad32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x16/;
specialize qw/vp9_highbd_sad16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x8/;
specialize qw/vp9_highbd_sad16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x16/;
specialize qw/vp9_highbd_sad8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x8/;
specialize qw/vp9_highbd_sad8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x4/;
specialize qw/vp9_highbd_sad8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad4x8/;
......@@ -1691,37 +1691,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sad4x4/;
add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad64x64_avg/;
specialize qw/vp9_highbd_sad64x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x64_avg/;
specialize qw/vp9_highbd_sad32x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad64x32_avg/;
specialize qw/vp9_highbd_sad64x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x16_avg/;
specialize qw/vp9_highbd_sad32x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x32_avg/;
specialize qw/vp9_highbd_sad16x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x32_avg/;
specialize qw/vp9_highbd_sad32x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x16_avg/;
specialize qw/vp9_highbd_sad16x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x8_avg/;
specialize qw/vp9_highbd_sad16x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x16_avg/;
specialize qw/vp9_highbd_sad8x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x8_avg/;
specialize qw/vp9_highbd_sad8x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x4_avg/;
specialize qw/vp9_highbd_sad8x4_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad4x8_avg/;
......@@ -1778,44 +1778,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sad4x4x8/;
add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad64x64x4d/;
specialize qw/vp9_highbd_sad64x64x4d sse2/;
add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x64x4d/;
specialize qw/vp9_highbd_sad32x64x4d sse2/;
add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad64x32x4d/;
specialize qw/vp9_highbd_sad64x32x4d sse2/;
add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x16x4d/;
specialize qw/vp9_highbd_sad32x16x4d sse2/;
add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x32x4d/;
specialize qw/vp9_highbd_sad16x32x4d sse2/;
add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x32x4d/;
specialize qw/vp9_highbd_sad32x32x4d sse2/;
add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x16x4d/;
specialize qw/vp9_highbd_sad16x16x4d sse2/;
add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x8x4d/;
specialize qw/vp9_highbd_sad16x8x4d sse2/;
add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x16x4d/;
specialize qw/vp9_highbd_sad8x16x4d sse2/;
add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x8x4d/;
specialize qw/vp9_highbd_sad8x8x4d sse2/;
# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x4x4d/;
specialize qw/vp9_highbd_sad8x4x4d sse2/;
add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad4x8x4d/;
specialize qw/vp9_highbd_sad4x8x4d sse2/;
add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad4x4x4d/;
specialize qw/vp9_highbd_sad4x4x4d sse2/;
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x16/;
......
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION .text
; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_4x2x4 5-6 0
movh m0, [srcq +%2*2]
%if %1 == 1
movu m4, [ref1q+%3*2]
movu m5, [ref2q+%3*2]
movu m6, [ref3q+%3*2]
movu m7, [ref4q+%3*2]
movhps m0, [srcq +%4*2]
movhps m4, [ref1q+%5*2]
movhps m5, [ref2q+%5*2]
movhps m6, [ref3q+%5*2]
movhps m7, [ref4q+%5*2]
mova m3, m0
mova m2, m0
psubusw m3, m4
psubusw m2, m5
psubusw m4, m0
psubusw m5, m0
por m4, m3
por m5, m2
pmaddwd m4, m1
pmaddwd m5, m1
mova m3, m0
mova m2, m0
psubusw m3, m6
psubusw m2, m7
psubusw m6, m0
psubusw m7, m0
por m6, m3
por m7, m2
pmaddwd m6, m1
pmaddwd m7, m1
%else
movu m2, [ref1q+%3*2]
movhps m0, [srcq +%4*2]
movhps m2, [ref1q+%5*2]
mova m3, m0
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m4, m2
movu m2, [ref2q+%3*2]
mova m3, m0
movhps m2, [ref2q+%5*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m5, m2
movu m2, [ref3q+%3*2]
mova m3, m0
movhps m2, [ref3q+%5*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m6, m2
movu m2, [ref4q+%3*2]
mova m3, m0
movhps m2, [ref4q+%5*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m7, m2
%endif
%if %6 == 1
lea srcq, [srcq +src_strideq*4]
lea ref1q, [ref1q+ref_strideq*4]
lea ref2q, [ref2q+ref_strideq*4]
lea ref3q, [ref3q+ref_strideq*4]
lea ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro
; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_8x2x4 5-6 0
; 1st 8 px
mova m0, [srcq +%2*2]
%if %1 == 1
movu m4, [ref1q+%3*2]
movu m5, [ref2q+%3*2]
movu m6, [ref3q+%3*2]
movu m7, [ref4q+%3*2]
mova m3, m0
mova m2, m0
psubusw m3, m4
psubusw m2, m5
psubusw m4, m0
psubusw m5, m0
por m4, m3
por m5, m2
pmaddwd m4, m1
pmaddwd m5, m1
mova m3, m0
mova m2, m0
psubusw m3, m6
psubusw m2, m7
psubusw m6, m0
psubusw m7, m0
por m6, m3
por m7, m2
pmaddwd m6, m1
pmaddwd m7, m1
%else
mova m3, m0
movu m2, [ref1q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m4, m2
movu m2, [ref2q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m5, m2
movu m2, [ref3q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m6, m2
movu m2, [ref4q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m7, m2
%endif
; 2nd 8 px
mova m0, [srcq +(%4)*2]
mova m3, m0
movu m2, [ref1q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m4, m2
movu m2, [ref2q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m5, m2
movu m2, [ref3q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m6, m2
movu m2, [ref4q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
%if %6 == 1
lea srcq, [srcq +src_strideq*4]
lea ref1q, [ref1q+ref_strideq*4]
lea ref2q, [ref2q+ref_strideq*4]
lea ref3q, [ref3q+ref_strideq*4]
lea ref4q, [ref4q+ref_strideq*4]
%endif
por m2, m3
pmaddwd m2, m1
paddd m7, m2
%endmacro
; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_16x2x4 5-6 0
HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro
; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_32x2x4 5-6 0
HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro
; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_64x2x4 5-6 0
HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro
; void vp9_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
; uint8_t *ref[4], int ref_stride,
; unsigned int res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
%macro HIGH_SADNXN4D 2
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, \
res, ref2, ref3, ref4, one
%else
cglobal highbd_sad%1x%2x4d, 4, 8, 8, src, src_stride, ref1, ref_stride, \
ref2, ref3, ref4, one
%endif
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
mov ref2q, [ref1q+gprsize*1]
mov ref3q, [ref1q+gprsize*2]
mov ref4q, [ref1q+gprsize*3]
mov ref1q, [ref1q+gprsize*0]
; convert byte pointers to short pointers
shl srcq, 1
shl ref2q, 1
shl ref3q, 1
shl ref4q, 1
shl ref1q, 1
mov oned, 0x00010001
movd m1, oned
pshufd m1, m1, 0x0
HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
; N.B. HIGH_PROCESS outputs dwords (32 bits)
; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
movhlps m0, m4
movhlps m1, m5
movhlps m2, m6
movhlps m3, m7
paddd m4, m0
paddd m5, m1
paddd m6, m2
paddd m7, m3
punpckldq m4, m5
punpckldq m6, m7
movhlps m0, m4
movhlps m1, m6
paddd m4, m0
paddd m6, m1
punpcklqdq m4, m6
movifnidn r4, r4mp
movu [r4], m4
RET
%endmacro
INIT_XMM sse2
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16, 8
HIGH_SADNXN4D 8, 16
HIGH_SADNXN4D 8, 8
HIGH_SADNXN4D 8, 4
HIGH_SADNXN4D 4, 8
HIGH_SADNXN4D 4, 4
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION .text
%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
ref, ref_stride, \
second_pred, \
src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
lea src_stride3q, [src_strideq*3]
lea ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
shl srcq, 1
shl refq, 1
%if %4 == 1
shl second_predq, 1
%endif
%endmacro
; unsigned int vp9_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD64XN 1-2 0
HIGH_SAD_FN 64, %1, 5, %2
mov n_rowsd, %1
pxor m0, m0
pxor m6, m6
.loop:
; first half of each row
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+32]
movu m4, [refq+48]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+mmsize*1]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq]
psubusw m5, m1
psubusw m1, [srcq]
por m1, m5
mova m5, [srcq+16]
psubusw m5, m2
psubusw m2, [srcq+16]
por m2, m5
mova m5, [srcq+32]
psubusw m5, m3
psubusw m3, [srcq+32]
por m3, m5
mova m5, [srcq+48]
psubusw m5, m4
psubusw m4, [srcq+48]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
paddd m0, m1
paddd m0, m3
; second half of each row
movu m1, [refq+64]
movu m2, [refq+80]
movu m3, [refq+96]
movu m4, [refq+112]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+mmsize*1]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq+64]
psubusw m5, m1
psubusw m1, [srcq+64]
por m1, m5
mova m5, [srcq+80]
psubusw m5, m2
psubusw m2, [srcq+80]
por m2, m5
mova m5, [srcq+96]
psubusw m5, m3
psubusw m3, [srcq+96]
por m3, m5
mova m5, [srcq+112]
psubusw m5, m4
psubusw m4, [srcq+112]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
lea refq, [refq+ref_strideq*2]
paddd m0, m1
lea srcq, [srcq+src_strideq*2]
paddd m0, m3
dec n_rowsd
jg .loop
movhlps m1, m0
paddd m0, m1
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
movd eax, m0
RET
%endmacro
INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
; unsigned int vp9_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
HIGH_SAD_FN 32, %1, 5, %2
mov n_rowsd, %1
pxor m0, m0
pxor m6, m6
.loop:
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+32]
movu m4, [refq+48]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+mmsize*1]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]