Unverified Commit ed3cf834 authored by Kyle Siefring, committed by GitHub

Add SSSE3 sad asm (#696)

Mostly exclusive to 10-bit and 8-bit input; above 10 bits only the 4x4 kernel is used. Roughly a 10% speedup when last measured.
parent 34370492
......@@ -37,6 +37,7 @@ fn bench_get_sad(b: &mut Bencher, bs: &BlockSize) {
let bsh = bs.height();
let w = 640;
let h = 480;
let bit_depth = 10;
let input_plane = new_plane(&mut ra, w, h);
let rec_plane = new_plane(&mut ra, w, h);
let po = PlaneOffset { x: 0, y: 0 };
......@@ -45,7 +46,8 @@ fn bench_get_sad(b: &mut Bencher, bs: &BlockSize) {
let plane_ref = rec_plane.slice(&po);
b.iter(|| {
let _ = black_box(me::get_sad(&plane_org, &plane_ref, bsw, bsh));
let _ =
black_box(me::get_sad(&plane_org, &plane_ref, bsw, bsh, bit_depth));
})
}
......
......@@ -48,7 +48,7 @@ fn main() {
let mut config_include_arg = String::from("-I");
config_include_arg.push_str(&out_dir);
config_include_arg.push('/');
nasm_rs::compile_library_args("rav1easm", &["src/x86/ipred.asm"], &[&config_include_arg, "-Isrc/"]);
nasm_rs::compile_library_args("rav1easm", &["src/x86/ipred.asm", "src/x86/me.asm"], &[&config_include_arg, "-Isrc/"]);
println!("cargo:rustc-link-lib=static=rav1easm");
rerun_dir("src/x86");
rerun_dir("src/ext/x86");
......
......@@ -2205,7 +2205,7 @@ fn encode_tile(sequence: &mut Sequence, fi: &FrameInvariants, fs: &mut FrameStat
let r = fi.ref_frames[i] as usize;
if pmvs[r].is_none() {
assert!(!sequence.use_128x128_superblock);
pmvs[r] = estimate_motion_ss4(fi, fs, BlockSize::BLOCK_64X64, r, &bo);
pmvs[r] = estimate_motion_ss4(fi, fs, BlockSize::BLOCK_64X64, r, &bo, sequence.bit_depth);
}
}
frame_pmvs.push(pmvs);
......@@ -2255,16 +2255,16 @@ fn encode_tile(sequence: &mut Sequence, fi: &FrameInvariants, fs: &mut FrameStat
assert!(!sequence.use_128x128_superblock);
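// One MV estimate per 32x32 quadrant of the 64x64 superblock; block_offset is
// in 4x4 units, so (8, 0) addresses the top-right quadrant.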
pmvs[1][r] = estimate_motion_ss2(
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(0, 0), &[Some(pmv), pmv_w, pmv_n]
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(0, 0), &[Some(pmv), pmv_w, pmv_n], sequence.bit_depth
);
pmvs[2][r] = estimate_motion_ss2(
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(8, 0), &[Some(pmv), pmv_e, pmv_n]
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(8, 0), &[Some(pmv), pmv_e, pmv_n], sequence.bit_depth
);
pmvs[3][r] = estimate_motion_ss2(
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(0, 8), &[Some(pmv), pmv_w, pmv_s]
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(0, 8), &[Some(pmv), pmv_w, pmv_s], sequence.bit_depth
);
pmvs[4][r] = estimate_motion_ss2(
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(8, 8), &[Some(pmv), pmv_e, pmv_s]
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(8, 8), &[Some(pmv), pmv_e, pmv_s], sequence.bit_depth
);
}
}
......
......@@ -12,14 +12,95 @@ use context::BLOCK_TO_PLANE_SHIFT;
use context::MI_SIZE;
use partition::*;
use plane::*;
use util::*;
use FrameInvariants;
use FrameState;
use libc;
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(windows)))]
extern {
fn rav1e_sad_4x4_hbd_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
}
#[cfg(all(target_arch = "x86_64", not(windows)))]
extern {
fn rav1e_sad_8x8_hbd10_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
fn rav1e_sad_16x16_hbd_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
fn rav1e_sad_32x32_hbd10_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
fn rav1e_sad_64x64_hbd10_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
fn rav1e_sad_128x128_hbd10_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
}
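As a sanity reference, here is a minimal sketch of how one of these externs can be wrapped safely. The wrapper name and bounds check are illustrative, not part of the patch; the byte-stride conversion matches what sad_ssse3 does below.

// Illustrative wrapper around the 4x4 kernel. The assembly takes byte
// strides, while rav1e plane strides count u16 elements. The caller must
// have verified SSSE3 support (e.g. via is_x86_feature_detected!).
#[cfg(all(target_arch = "x86_64", not(windows)))]
fn sad_4x4_hbd(src: &[u16], src_stride: usize, dst: &[u16], dst_stride: usize) -> u32 {
  // Ensure all four rows of four pixels are in bounds.
  assert!(src.len() >= src_stride * 3 + 4 && dst.len() >= dst_stride * 3 + 4);
  unsafe {
    rav1e_sad_4x4_hbd_ssse3(
      src.as_ptr(),
      (src_stride * 2) as libc::ptrdiff_t,
      dst.as_ptr(),
      (dst_stride * 2) as libc::ptrdiff_t
    )
  }
}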
#[cfg(all(target_arch = "x86_64", not(windows)))]
#[target_feature(enable = "ssse3")]
unsafe fn sad_ssse3(
plane_org: &PlaneSlice, plane_ref: &PlaneSlice, blk_h: usize, blk_w: usize,
bit_depth: usize
) -> u32 {
let mut sum = 0u32;
// Plane strides are counted in u16 elements, but the assembly takes byte
// strides, so scale by the size of u16.
let org_stride = plane_org.plane.cfg.stride as libc::ptrdiff_t * 2;
let ref_stride = plane_ref.plane.cfg.stride as libc::ptrdiff_t * 2;
assert!(blk_h >= 4 && blk_w >= 4);
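// Kernels larger than 4x4 accumulate in 16 bits and assume at most 10-bit
// input, so deeper input is limited to the 4x4 kernel. ilog maps the step
// size to a kernel: ilog(4) = 3, ilog(8) = 4, ..., ilog(128) = 8.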
let step_size =
blk_h.min(blk_w).min(if bit_depth <= 10 { 128 } else { 4 });
let func = match step_size.ilog() {
3 => rav1e_sad_4x4_hbd_ssse3,
4 => rav1e_sad_8x8_hbd10_ssse3,
5 => rav1e_sad_16x16_hbd_ssse3,
6 => rav1e_sad_32x32_hbd10_ssse3,
7 => rav1e_sad_64x64_hbd10_ssse3,
8 => rav1e_sad_128x128_hbd10_ssse3,
_ => rav1e_sad_128x128_hbd10_ssse3
};
for r in (0..blk_h).step_by(step_size) {
for c in (0..blk_w).step_by(step_size) {
let org_slice = plane_org.subslice(c, r);
let ref_slice = plane_ref.subslice(c, r);
let org_ptr = org_slice.as_slice().as_ptr();
let ref_ptr = ref_slice.as_slice().as_ptr();
sum += func(org_ptr, org_stride, ref_ptr, ref_stride);
}
}
sum
}
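To make the tiling concrete: a 64x16 block at 10 bits gets step_size = min(16, 128) = 16, ilog(16) = 5 selects the 16x16 kernel, and the loops issue four calls. A small standalone sketch of the same arithmetic, for illustration only:

// Illustrative: reproduces the step-size and tile-count arithmetic of
// sad_ssse3 without calling into assembly.
fn sad_tiling(blk_w: usize, blk_h: usize, bit_depth: usize) -> (usize, usize) {
  let step = blk_h.min(blk_w).min(if bit_depth <= 10 { 128 } else { 4 });
  // Block dimensions are powers of two >= 4, so step divides both evenly.
  (step, (blk_h / step) * (blk_w / step))
}

// sad_tiling(64, 16, 10) == (16, 4): four 16x16 kernel calls.
// sad_tiling(64, 64, 12) == (4, 256): above 10 bits only 4x4 is safe.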
#[inline(always)]
pub fn get_sad(
plane_org: &PlaneSlice, plane_ref: &PlaneSlice, blk_h: usize,
blk_w: usize
plane_org: &PlaneSlice, plane_ref: &PlaneSlice, blk_h: usize, blk_w: usize,
bit_depth: usize
) -> u32 {
#[cfg(all(target_arch = "x86_64", not(windows)))]
{
if is_x86_feature_detected!("ssse3") && blk_h >= 4 && blk_w >= 4 {
return unsafe {
sad_ssse3(plane_org, plane_ref, blk_h, blk_w, bit_depth)
};
}
}
let mut sum = 0u32;
let org_iter = plane_org.iter_width(blk_w);
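The rest of the scalar fallback is elided by the diff; in spirit it reduces to the following sketch over raw row-major buffers (the real code iterates PlaneSlices):

// Illustrative scalar SAD over two row-major u16 buffers.
fn sad_scalar(
  org: &[u16], org_stride: usize, rec: &[u16], rec_stride: usize,
  blk_w: usize, blk_h: usize
) -> u32 {
  let mut sum = 0u32;
  for y in 0..blk_h {
    let o = &org[y * org_stride..][..blk_w];
    let r = &rec[y * rec_stride..][..blk_w];
    // Widen before subtracting so the difference cannot wrap.
    sum += o.iter().zip(r).map(|(&a, &b)| (i32::from(a) - i32::from(b)).abs() as u32).sum::<u32>();
  }
  sum
}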
......@@ -48,8 +129,8 @@ fn get_mv_range(fi: &FrameInvariants, bo: &BlockOffset, blk_w: usize, blk_h: usi
}
pub fn motion_estimation(
fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize,
bo: &BlockOffset, ref_frame: usize, pmv: MotionVector
fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize, bo: &BlockOffset,
ref_frame: usize, pmv: MotionVector, bit_depth: usize
) -> MotionVector {
match fi.rec_buffer.frames[fi.ref_frames[ref_frame - LAST_FRAME] as usize] {
Some(ref rec) => {
......@@ -70,8 +151,19 @@ pub fn motion_estimation(
let mut best_mv = MotionVector { row: 0, col: 0 };
full_search(
x_lo, x_hi, y_lo, y_hi, blk_h, blk_w,
&fs.input.planes[0], &rec.frame.planes[0], &mut best_mv, &mut lowest_sad, &po, 2
x_lo,
x_hi,
y_lo,
y_hi,
blk_h,
blk_w,
&fs.input.planes[0],
&rec.frame.planes[0],
&mut best_mv,
&mut lowest_sad,
&po,
2,
bit_depth
);
let mode = PredictionMode::NEWMV;
......@@ -116,7 +208,7 @@ pub fn motion_estimation(
let plane_org = fs.input.planes[0].slice(&po);
let plane_ref = tmp_plane.slice(&PlaneOffset { x: 0, y: 0 });
let sad = get_sad(&plane_org, &plane_ref, blk_h, blk_w);
let sad = get_sad(&plane_org, &plane_ref, blk_h, blk_w, bit_depth);
if sad < lowest_sad {
lowest_sad = sad;
......@@ -133,15 +225,17 @@ pub fn motion_estimation(
}
}
fn full_search(x_lo: isize, x_hi: isize, y_lo: isize, y_hi: isize, blk_h: usize, blk_w: usize,
p_org: &Plane, p_ref: &Plane, best_mv: &mut MotionVector, lowest_sad: &mut u32,
po: &PlaneOffset, step: usize) {
fn full_search(
x_lo: isize, x_hi: isize, y_lo: isize, y_hi: isize, blk_h: usize,
blk_w: usize, p_org: &Plane, p_ref: &Plane, best_mv: &mut MotionVector,
lowest_sad: &mut u32, po: &PlaneOffset, step: usize, bit_depth: usize
) {
for y in (y_lo..y_hi).step_by(step) {
for x in (x_lo..x_hi).step_by(step) {
let plane_org = p_org.slice(po);
let plane_ref = p_ref.slice(&PlaneOffset { x, y });
let sad = get_sad(&plane_org, &plane_ref, blk_h, blk_w);
let sad = get_sad(&plane_org, &plane_ref, blk_h, blk_w, bit_depth);
if sad < *lowest_sad {
*lowest_sad = sad;
......@@ -163,7 +257,8 @@ fn adjust_bo(bo: &BlockOffset, fi: &FrameInvariants, blk_w: usize, blk_h: usize)
}
pub fn estimate_motion_ss4(
fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize, ref_idx: usize, bo: &BlockOffset
fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize, ref_idx: usize,
bo: &BlockOffset, bit_depth: usize
) -> Option<MotionVector> {
if let Some(ref rec) = fi.rec_buffer.frames[ref_idx] {
let blk_w = bsize.width();
......@@ -184,8 +279,19 @@ pub fn estimate_motion_ss4(
let mut best_mv = MotionVector { row: 0, col: 0 };
full_search(
x_lo, x_hi, y_lo, y_hi, blk_h >> 2, blk_w >> 2,
&fs.input_qres, &rec.input_qres, &mut best_mv, &mut lowest_sad, &po, 1
x_lo,
x_hi,
y_lo,
y_hi,
blk_h >> 2,
blk_w >> 2,
&fs.input_qres,
&rec.input_qres,
&mut best_mv,
&mut lowest_sad,
&po,
1,
bit_depth
);
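// The search ran on the quarter-resolution planes, so scale the winning MV
// back to full resolution.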
Some(MotionVector { row: best_mv.row * 4, col: best_mv.col * 4 })
......@@ -195,7 +301,8 @@ pub fn estimate_motion_ss4(
}
pub fn estimate_motion_ss2(
fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize, ref_idx: usize, bo: &BlockOffset, pmvs: &[Option<MotionVector>; 3]
fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize, ref_idx: usize,
bo: &BlockOffset, pmvs: &[Option<MotionVector>; 3], bit_depth: usize
) -> Option<MotionVector> {
if let Some(ref rec) = fi.rec_buffer.frames[ref_idx] {
let blk_w = bsize.width();
......@@ -219,8 +326,19 @@ pub fn estimate_motion_ss2(
let y_hi = po.y + (((pmv.row as isize / 8 + range).min(mvy_max / 8)) >> 1);
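// Search on the half-resolution planes inside a window centered on the
// predictor; the >> 1 above rescales the full-resolution window to half
// resolution.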
full_search(
x_lo, x_hi, y_lo, y_hi, blk_h >> 1, blk_w >> 1,
&fs.input_hres, &rec.input_hres, &mut best_mv, &mut lowest_sad, &po, 1
x_lo,
x_hi,
y_lo,
y_hi,
blk_h >> 1,
blk_w >> 1,
&fs.input_hres,
&rec.input_hres,
&mut best_mv,
&mut lowest_sad,
&po,
1,
bit_depth
);
}
}
......@@ -292,17 +410,21 @@ pub mod test {
(BLOCK_64X16, 93344),
];
let bit_depth: usize = 8;
let (input_plane, rec_plane) = setup_sad();
for block in blocks {
let bsw = block.0.width();
let bsh = block.0.height();
let po = PlaneOffset { x: 40, y: 40 };
let bsw = block.0.width();
let bsh = block.0.height();
let po = PlaneOffset { x: 40, y: 40 };
let mut input_slice = input_plane.slice(&po);
let mut rec_slice = rec_plane.slice(&po);
let mut input_slice = input_plane.slice(&po);
let mut rec_slice = rec_plane.slice(&po);
assert_eq!(block.1, get_sad(&mut input_slice, &mut rec_slice, bsw, bsh));
assert_eq!(
block.1,
get_sad(&mut input_slice, &mut rec_slice, bsw, bsh, bit_depth)
);
}
}
}
......@@ -386,7 +386,10 @@ pub fn rdo_mode_decision(
let slot_idx = fi.ref_frames[i - LAST_FRAME];
ref_slot_set.push(slot_idx);
let pmv = pmvs[slot_idx as usize].unwrap();
mvs_from_me.push([motion_estimation(fi, fs, bsize, bo, i, pmv), MotionVector { row: 0, col: 0 }]);
mvs_from_me.push([
motion_estimation(fi, fs, bsize, bo, i, pmv, seq.bit_depth),
MotionVector { row: 0, col: 0 }
]);
}
}
assert!(ref_frames_set.len() != 0);
......
; Copyright (c) 2018, The rav1e contributors. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION .text
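; W_ABS_DIFF: packed 16-bit absolute difference; leaves |%1 - %5| .. |%4 - %8|
; in %1-%4.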
%macro W_ABS_DIFF 8
psubw %1, %5
psubw %2, %6
psubw %3, %7
psubw %4, %8
pabsw %1, %1
pabsw %2, %2
pabsw %3, %3
pabsw %4, %4
%endmacro
INIT_XMM ssse3
cglobal sad_4x4_hbd, 4, 6, 8, src, src_stride, dst, dst_stride, \
src_stride3, dst_stride3
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
movq m0, [srcq]
movq m1, [srcq+src_strideq*1]
movq m2, [srcq+src_strideq*2]
movq m3, [srcq+src_stride3q]
movq m4, [dstq]
movq m5, [dstq+dst_strideq*1]
movq m6, [dstq+dst_strideq*2]
movq m7, [dstq+dst_stride3q]
W_ABS_DIFF m0, m1, m2, m3, m4, m5, m6, m7
; Don't convert to 32-bit integers: the 4x4 = 16 abs diffs of 12-bit values fit in 16 bits.
; Accumulate onto m0
%define sum m0
paddw sum, m1
paddw m2, m3
paddw sum, m2
; Horizontal reduction
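; q2323 folds words 2-3 onto words 0-1, then q1111 folds word 1 onto word 0.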
pshuflw m1, sum, q2323
paddw sum, m1
pshuflw m1, sum, q1111
paddw sum, m1
movd eax, sum
; Keep only the low 16 bits; the reduction leaves garbage in the upper half of eax
movzx eax, ax
RET
%if ARCH_X86_64
; 10-bit only
INIT_XMM ssse3
cglobal sad_8x8_hbd10, 4, 7, 9, src, src_stride, dst, dst_stride, \
src_stride3, dst_stride3, cnt
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
mov cntd, 2
%define sum m0
pxor sum, sum
.loop:
movu m1, [srcq]
movu m2, [srcq+src_strideq*1]
movu m3, [srcq+src_strideq*2]
movu m4, [srcq+src_stride3q]
lea srcq, [srcq+src_strideq*4]
movu m5, [dstq]
movu m6, [dstq+dst_strideq*1]
movu m7, [dstq+dst_strideq*2]
movu m8, [dstq+dst_stride3q]
lea dstq, [dstq+dst_strideq*4]
W_ABS_DIFF m1, m2, m3, m4, m5, m6, m7, m8
paddw m1, m2
paddw m3, m4
paddw sum, m1
paddw sum, m3
dec cntd
jg .loop
; Don't convert to 32-bit integers: the 8x8 = 64 abs diffs of 10-bit values fit in 16 bits.
; Horizontal reduction
movhlps m1, sum
paddw sum, m1
pshuflw m1, sum, q2323
paddw sum, m1
pshuflw m1, sum, q1111
paddw sum, m1
movd eax, sum
; Keep only the low 16 bits; the reduction leaves garbage in the upper half of eax
movzx eax, ax
RET
INIT_XMM ssse3
cglobal sad_16x16_hbd, 4, 5, 9, src, src_stride, dst, dst_stride, \
cnt
mov cntd, 8
%define sum m0
pxor sum, sum
.loop:
movu m1, [srcq]
movu m2, [srcq+16]
movu m3, [srcq+src_strideq]
movu m4, [srcq+src_strideq+16]
lea srcq, [srcq+src_strideq*2]
movu m5, [dstq]
movu m6, [dstq+16]
movu m7, [dstq+dst_strideq]
movu m8, [dstq+dst_strideq+16]
lea dstq, [dstq+dst_strideq*2]
W_ABS_DIFF m1, m2, m3, m4, m5, m6, m7, m8
paddw m1, m2
paddw m3, m4
paddw sum, m1
paddw sum, m3
dec cntd
jg .loop
; Convert to 32-bits
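; Interleaving with zero zero-extends each 16-bit partial sum to 32 bits.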
pxor m1, m1
punpcklwd m2, sum, m1
punpckhwd sum, m1
paddd sum, m2
; Horizontal reduction
movhlps m1, sum
paddd sum, m1
pshufd m1, sum, q1111
paddd sum, m1
movd eax, sum
RET
; 10-bit only
INIT_XMM ssse3
cglobal sad_32x32_hbd10, 4, 5, 10, src, src_stride, dst, dst_stride, \
cnt
mov cntd, 32
; Accumulate onto multiple registers to avoid overflowing before converting
; to 32-bits.
pxor m0, m0
pxor m1, m1
.loop:
movu m2, [srcq]
movu m3, [srcq+16]
movu m4, [srcq+32]
movu m5, [srcq+48]
lea srcq, [srcq+src_strideq]
movu m6, [dstq]
movu m7, [dstq+16]
movu m8, [dstq+32]
movu m9, [dstq+48]
lea dstq, [dstq+dst_strideq]
W_ABS_DIFF m2, m3, m4, m5, m6, m7, m8, m9
paddw m2, m3
paddw m4, m5
paddw m0, m2
paddw m1, m4
dec cntd
jg .loop
; Convert to 32-bits
pxor m2, m2
punpcklwd m3, m0, m2
punpckhwd m0, m2
paddd m0, m3
punpcklwd m3, m1, m2
punpckhwd m1, m2
paddd m1, m3
paddd m0, m1
; Horizontal reduction
movhlps m1, m0
paddd m0, m1
pshufd m1, m0, q1111
paddd m0, m1
movd eax, m0
RET
%macro SAD_64X16_HBD10_INTERNAL 1
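; %1: row-counter register. Processes 16 rows of 64 pixels (128 bytes per
; row), accumulating onto m0.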
mov %1, 16
; Accumulate onto multiple registers to avoid overflowing before converting
; to 32-bits.
; In this case, the partial sums also need to fit into 16-bit SIGNED integers,
; since pmaddwd treats each word as signed.
pxor m1, m1
pxor m2, m2
pxor m3, m3
pxor m4, m4
.innerloop:
movu m5, [srcq]
movu m6, [srcq+16]
movu m7, [srcq+32]
movu m8, [srcq+48]
movu m9, [dstq]
movu m10, [dstq+16]
movu m11, [dstq+32]
movu m12, [dstq+48]
W_ABS_DIFF m5, m6, m7, m8, m9, m10, m11, m12
; Evenly distribute abs diffs among the registers we use for accumulation.
paddw m1, m5
paddw m2, m6
paddw m3, m7
paddw m4, m8
movu m5, [srcq+64]
movu m6, [srcq+80]
movu m7, [srcq+96]
movu m8, [srcq+112]
lea srcq, [srcq+src_strideq]
movu m9, [dstq+64]
movu m10, [dstq+80]
movu m11, [dstq+96]
movu m12, [dstq+112]
lea dstq, [dstq+dst_strideq]
W_ABS_DIFF m5, m6, m7, m8, m9, m10, m11, m12
; Evenly distribute abs diffs among the registers we use for accumulation.
paddw m1, m5
paddw m2, m6
paddw m3, m7
paddw m4, m8
dec %1
jg .innerloop
; Convert to 32-bits by performing (-1*a) + (-1*b) on pairs of horizontal words.
; This has to be corrected for later.
; TODO: punpck might be faster since we'd only have to do it half as often.
pcmpeqd m5, m5
pmaddwd m1, m5
pmaddwd m2, m5
pmaddwd m3, m5
pmaddwd m4, m5
; Reduce from 4 registers to 2, then add them to m0
paddd m1, m2
paddd m3, m4
paddd m0, m1
paddd m0, m3
%endmacro
; 10-bit only
INIT_XMM ssse3
cglobal sad_64x64_hbd10, 4, 5, 13, src, src_stride, dst, dst_stride, \
cnt1, cnt2
pxor m0, m0
; Repeatedly accumulate SAD from horizontal slices of the block onto m0. Each
; call advances src and dst as it runs, allowing the next call to carry on
; where the previous one left off.
; Note that while converting from 16 to 32 bits, the macro performs
; (-1*a) + (-1*b) on pairs of horizontal words; this is corrected for by
; negating the final output.
mov cnt1d, 4
.loop:
SAD_64X16_HBD10_INTERNAL cnt2d
dec cnt1d
jg .loop
; Horizontal reduction
movhlps m1, m0
paddd m0, m1
pshufd m1, m0, q1111
paddd m0, m1
movd eax, m0
; Negate to reverse the change in sign caused by the 32-bit conversion.
neg eax
RET
%macro SAD_128X8_HBD10_INTERNAL 2
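; %1: row counter, %2: column counter. Processes 8 rows of 128 pixels in four
; 32-pixel chunks, accumulating onto m0.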
mov %1, 8
; Accumulate onto multiple registers to avoid overflowing before converting
; to 32-bits.
; In this case, the partial sums also need to fit into 16-bit SIGNED integers,
; since pmaddwd treats each word as signed.
pxor m1, m1
pxor m2, m2
pxor m3, m3
pxor m4, m4
.outer_loop:
; Iterate over columns in this row.
mov %2, 4
.inner_loop:
movu m5, [srcq]
movu m6, [srcq+16]
movu m7, [srcq+32]
movu m8, [srcq+48]
lea srcq, [srcq+64]
movu m9, [dstq]
movu m10, [dstq+16]
movu m11, [dstq+32]
movu m12, [dstq+48]
lea dstq, [dstq+64]
W_ABS_DIFF m5, m6, m7, m8, m9, m10, m11, m12
; Evenly distribute abs diffs among the registers we use for accumulation.
paddw m1, m5
paddw m2, m6
paddw m3, m7
paddw m4, m8
dec %2
jg .inner_loop
; When stepping to the next row, undo the 256 bytes we advanced across the columns.
lea srcq, [srcq+src_strideq-256]
lea dstq, [dstq+dst_strideq-256]
dec %1
jg .outer_loop
; Convert to 32-bits by performing (-1*a) + (-1*b) on pairs of horizontal words.
; This has to be corrected for later.
; TODO: punpck might be faster since we'd only have to do it half as often.
pcmpeqd m5, m5
pmaddwd m1, m5
pmaddwd m2, m5
pmaddwd m3, m5
pmaddwd m4, m5
; Reduce from 4 registers to 2, then add them to m0
paddd m1, m2
paddd m3, m4
paddd m0, m1
paddd m0, m3
%endmacro
; 10-bit only
INIT_XMM ssse3
cglobal sad_128x128_hbd10, 4, 7, 13, src, src_stride, dst, dst_stride, \
cnt1, cnt2, cnt3