Unverified commit ed3cf834, authored by Kyle Siefring and committed by GitHub

Add SSSE3 sad asm (#696)

The new kernels are mostly limited to 8-bit and 10-bit input. Roughly a 10% speedup when last measured.
parent 34370492
...@@ -37,6 +37,7 @@ fn bench_get_sad(b: &mut Bencher, bs: &BlockSize) { ...@@ -37,6 +37,7 @@ fn bench_get_sad(b: &mut Bencher, bs: &BlockSize) {
let bsh = bs.height(); let bsh = bs.height();
let w = 640; let w = 640;
let h = 480; let h = 480;
let bit_depth = 10;
let input_plane = new_plane(&mut ra, w, h); let input_plane = new_plane(&mut ra, w, h);
let rec_plane = new_plane(&mut ra, w, h); let rec_plane = new_plane(&mut ra, w, h);
let po = PlaneOffset { x: 0, y: 0 }; let po = PlaneOffset { x: 0, y: 0 };
...@@ -45,7 +46,8 @@ fn bench_get_sad(b: &mut Bencher, bs: &BlockSize) { ...@@ -45,7 +46,8 @@ fn bench_get_sad(b: &mut Bencher, bs: &BlockSize) {
let plane_ref = rec_plane.slice(&po); let plane_ref = rec_plane.slice(&po);
b.iter(|| { b.iter(|| {
let _ = black_box(me::get_sad(&plane_org, &plane_ref, bsw, bsh)); let _ =
black_box(me::get_sad(&plane_org, &plane_ref, bsw, bsh, bit_depth));
}) })
} }
......
...@@ -48,7 +48,7 @@ fn main() { ...@@ -48,7 +48,7 @@ fn main() {
let mut config_include_arg = String::from("-I"); let mut config_include_arg = String::from("-I");
config_include_arg.push_str(&out_dir); config_include_arg.push_str(&out_dir);
config_include_arg.push('/'); config_include_arg.push('/');
nasm_rs::compile_library_args("rav1easm", &["src/x86/ipred.asm"], &[&config_include_arg, "-Isrc/"]); nasm_rs::compile_library_args("rav1easm", &["src/x86/ipred.asm", "src/x86/me.asm"], &[&config_include_arg, "-Isrc/"]);
println!("cargo:rustc-link-lib=static=rav1easm"); println!("cargo:rustc-link-lib=static=rav1easm");
rerun_dir("src/x86"); rerun_dir("src/x86");
rerun_dir("src/ext/x86"); rerun_dir("src/ext/x86");
......
...@@ -2205,7 +2205,7 @@ fn encode_tile(sequence: &mut Sequence, fi: &FrameInvariants, fs: &mut FrameStat ...@@ -2205,7 +2205,7 @@ fn encode_tile(sequence: &mut Sequence, fi: &FrameInvariants, fs: &mut FrameStat
let r = fi.ref_frames[i] as usize; let r = fi.ref_frames[i] as usize;
if pmvs[r].is_none() { if pmvs[r].is_none() {
assert!(!sequence.use_128x128_superblock); assert!(!sequence.use_128x128_superblock);
pmvs[r] = estimate_motion_ss4(fi, fs, BlockSize::BLOCK_64X64, r, &bo); pmvs[r] = estimate_motion_ss4(fi, fs, BlockSize::BLOCK_64X64, r, &bo, sequence.bit_depth);
} }
} }
frame_pmvs.push(pmvs); frame_pmvs.push(pmvs);
...@@ -2255,16 +2255,16 @@ fn encode_tile(sequence: &mut Sequence, fi: &FrameInvariants, fs: &mut FrameStat ...@@ -2255,16 +2255,16 @@ fn encode_tile(sequence: &mut Sequence, fi: &FrameInvariants, fs: &mut FrameStat
assert!(!sequence.use_128x128_superblock); assert!(!sequence.use_128x128_superblock);
pmvs[1][r] = estimate_motion_ss2( pmvs[1][r] = estimate_motion_ss2(
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(0, 0), &[Some(pmv), pmv_w, pmv_n] fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(0, 0), &[Some(pmv), pmv_w, pmv_n], sequence.bit_depth
); );
pmvs[2][r] = estimate_motion_ss2( pmvs[2][r] = estimate_motion_ss2(
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(8, 0), &[Some(pmv), pmv_e, pmv_n] fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(8, 0), &[Some(pmv), pmv_e, pmv_n], sequence.bit_depth
); );
pmvs[3][r] = estimate_motion_ss2( pmvs[3][r] = estimate_motion_ss2(
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(0, 8), &[Some(pmv), pmv_w, pmv_s] fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(0, 8), &[Some(pmv), pmv_w, pmv_s], sequence.bit_depth
); );
pmvs[4][r] = estimate_motion_ss2( pmvs[4][r] = estimate_motion_ss2(
fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(8, 8), &[Some(pmv), pmv_e, pmv_s] fi, fs, BlockSize::BLOCK_32X32, r, &sbo.block_offset(8, 8), &[Some(pmv), pmv_e, pmv_s], sequence.bit_depth
); );
} }
} }
......
...@@ -12,14 +12,95 @@ use context::BLOCK_TO_PLANE_SHIFT; ...@@ -12,14 +12,95 @@ use context::BLOCK_TO_PLANE_SHIFT;
use context::MI_SIZE; use context::MI_SIZE;
use partition::*; use partition::*;
use plane::*; use plane::*;
use util::*;
use FrameInvariants; use FrameInvariants;
use FrameState; use FrameState;
use libc;
// Foreign declaration of the 4x4 sum-of-absolute-differences kernel.
// It is declared for both 32-bit and 64-bit x86 on non-Windows targets.
// NOTE(review): presumably implemented in src/x86/me.asm, which this change
// adds to the NASM build -- confirm against the assembly source.
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(windows)))]
extern {
// Takes two pointers to 16-bit samples, each with its own stride, and
// returns a 32-bit total. The visible caller passes the plane stride
// multiplied by two, so the strides are presumably expressed in bytes
// rather than in samples -- TODO confirm.
fn rav1e_sad_4x4_hbd_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
}
// Foreign declarations of the larger square sum-of-absolute-differences
// kernels. These are only declared for x86_64 on non-Windows targets.
// All of them share one signature: two pointers to 16-bit samples, a stride
// for each, and a 32-bit result.
// In the visible caller, every kernel in this block is selected only when
// the bit depth is at most 10; for higher bit depths the tile size is
// clamped to 4, so only the 4x4 kernel is used.
// NOTE(review): the `hbd10` suffix presumably marks kernels that are only
// valid for up to 10-bit input -- confirm against the assembly source.
#[cfg(all(target_arch = "x86_64", not(windows)))]
extern {
// 8x8 kernel.
fn rav1e_sad_8x8_hbd10_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
// 16x16 kernel.
fn rav1e_sad_16x16_hbd_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
// 32x32 kernel.
fn rav1e_sad_32x32_hbd10_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
// 64x64 kernel.
fn rav1e_sad_64x64_hbd10_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
// 128x128 kernel.
fn rav1e_sad_128x128_hbd10_ssse3(
src: *const u16, src_stride: libc::ptrdiff_t, dst: *const u16,
dst_stride: libc::ptrdiff_t
) -> u32;
}
/// Computes the sum of absolute differences between two plane regions by
/// tiling the region with square SSSE3 assembly kernels.
///
/// The tile side is the smaller of `blk_h` and `blk_w`, capped at 128 when
/// `bit_depth` is at most 10 and at 4 otherwise. The region is walked in
/// steps of that side and the per-tile results are added together.
///
/// # Panics
///
/// Panics if `blk_h` or `blk_w` is smaller than 4.
///
/// # Safety
///
/// The caller must ensure the running CPU supports SSSE3. The kernels read
/// through raw pointers derived from the two slices, so both planes must
/// hold at least `blk_h` rows of `blk_w` samples starting at the slice
/// origin.
/// NOTE(review): the exact read footprint of each kernel is defined by the
/// assembly source and should be confirmed there.
#[cfg(all(target_arch = "x86_64", not(windows)))]
#[target_feature(enable = "ssse3")]
unsafe fn sad_ssse3(
  plane_org: &PlaneSlice, plane_ref: &PlaneSlice, blk_h: usize, blk_w: usize,
  bit_depth: usize
) -> u32 {
  // The plane stride is scaled by the sample size before being handed to
  // the kernels, which presumably expect byte strides -- TODO confirm
  // against the assembly. Spelling the factor as `size_of::<u16>()` ties it
  // to the `*const u16` pointer type instead of a bare `2`.
  let bytes_per_sample = ::std::mem::size_of::<u16>() as libc::ptrdiff_t;
  let org_stride =
    plane_org.plane.cfg.stride as libc::ptrdiff_t * bytes_per_sample;
  let ref_stride =
    plane_ref.plane.cfg.stride as libc::ptrdiff_t * bytes_per_sample;

  assert!(blk_h >= 4 && blk_w >= 4);

  // Largest square tile that fits inside the block. Above 10 bits only the
  // 4x4 kernel is used.
  let max_step = if bit_depth <= 10 { 128 } else { 4 };
  let step_size = blk_h.min(blk_w).min(max_step);

  // The arms below pair `ilog()` results 3..=8 with tile sides 4..=128.
  // The catch-all arm keeps the original fallback; with the assertion and
  // the clamp above it is not expected to be taken for power-of-two sizes.
  let func = match step_size.ilog() {
    3 => rav1e_sad_4x4_hbd_ssse3,
    4 => rav1e_sad_8x8_hbd10_ssse3,
    5 => rav1e_sad_16x16_hbd_ssse3,
    6 => rav1e_sad_32x32_hbd10_ssse3,
    7 => rav1e_sad_64x64_hbd10_ssse3,
    8 => rav1e_sad_128x128_hbd10_ssse3,
    _ => rav1e_sad_128x128_hbd10_ssse3
  };

  let mut sum = 0u32;
  for r in (0..blk_h).step_by(step_size) {
    for c in (0..blk_w).step_by(step_size) {
      // Re-anchor both slices at the top-left corner of the current tile.
      let org_slice = plane_org.subslice(c, r);
      let ref_slice = plane_ref.subslice(c, r);
      let org_ptr = org_slice.as_slice().as_ptr();
      let ref_ptr = ref_slice.as_slice().as_ptr();
      sum += func(org_ptr, org_stride, ref_ptr, ref_stride);
    }
  }
  sum
}
#[inline(always)] #[inline(always)]
pub fn get_sad( pub fn get_sad(
plane_org: &PlaneSlice, plane_ref: &PlaneSlice, blk_h: usize, plane_org: &PlaneSlice, plane_ref: &PlaneSlice, blk_h: usize, blk_w: usize,
blk_w: usize bit_depth: usize
) -> u32 { ) -> u32 {
#[cfg(all(target_arch = "x86_64", not(windows)))]
{
if is_x86_feature_detected!("ssse3") && blk_h >= 4 && blk_w >= 4 {
return unsafe {
sad_ssse3(plane_org, plane_ref, blk_h, blk_w, bit_depth)
};
}
}
let mut sum = 0 as u32; let mut sum = 0 as u32;
let org_iter = plane_org.iter_width(blk_w); let org_iter = plane_org.iter_width(blk_w);
...@@ -48,8 +129,8 @@ fn get_mv_range(fi: &FrameInvariants, bo: &BlockOffset, blk_w: usize, blk_h: usi ...@@ -48,8 +129,8 @@ fn get_mv_range(fi: &FrameInvariants, bo: &BlockOffset, blk_w: usize, blk_h: usi
} }
pub fn motion_estimation( pub fn motion_estimation(
fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize, fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize, bo: &BlockOffset,
bo: &BlockOffset, ref_frame: usize, pmv: MotionVector ref_frame: usize, pmv: MotionVector, bit_depth: usize
) -> MotionVector { ) -> MotionVector {
match fi.rec_buffer.frames[fi.ref_frames[ref_frame - LAST_FRAME] as usize] { match fi.rec_buffer.frames[fi.ref_frames[ref_frame - LAST_FRAME] as usize] {
Some(ref rec) => { Some(ref rec) => {
...@@ -70,8 +151,19 @@ pub fn motion_estimation( ...@@ -70,8 +151,19 @@ pub fn motion_estimation(
let mut best_mv = MotionVector { row: 0, col: 0 }; let mut best_mv = MotionVector { row: 0, col: 0 };
full_search( full_search(
x_lo, x_hi, y_lo, y_hi, blk_h, blk_w, x_lo,
&fs.input.planes[0], &rec.frame.planes[0], &mut best_mv, &mut lowest_sad, &po, 2 x_hi,
y_lo,
y_hi,
blk_h,
blk_w,
&fs.input.planes[0],
&rec.frame.planes[0],
&mut best_mv,
&mut lowest_sad,
&po,
2,
bit_depth
); );
let mode = PredictionMode::NEWMV; let mode = PredictionMode::NEWMV;
...@@ -116,7 +208,7 @@ pub fn motion_estimation( ...@@ -116,7 +208,7 @@ pub fn motion_estimation(
let plane_org = fs.input.planes[0].slice(&po); let plane_org = fs.input.planes[0].slice(&po);
let plane_ref = tmp_plane.slice(&PlaneOffset { x: 0, y: 0 }); let plane_ref = tmp_plane.slice(&PlaneOffset { x: 0, y: 0 });
let sad = get_sad(&plane_org, &plane_ref, blk_h, blk_w); let sad = get_sad(&plane_org, &plane_ref, blk_h, blk_w, bit_depth);
if sad < lowest_sad { if sad < lowest_sad {
lowest_sad = sad; lowest_sad = sad;
...@@ -133,15 +225,17 @@ pub fn motion_estimation( ...@@ -133,15 +225,17 @@ pub fn motion_estimation(
} }
} }
fn full_search(x_lo: isize, x_hi: isize, y_lo: isize, y_hi: isize, blk_h: usize, blk_w: usize, fn full_search(
p_org: &Plane, p_ref: &Plane, best_mv: &mut MotionVector, lowest_sad: &mut u32, x_lo: isize, x_hi: isize, y_lo: isize, y_hi: isize, blk_h: usize,
po: &PlaneOffset, step: usize) { blk_w: usize, p_org: &Plane, p_ref: &Plane, best_mv: &mut MotionVector,
lowest_sad: &mut u32, po: &PlaneOffset, step: usize, bit_depth: usize
) {
for y in (y_lo..y_hi).step_by(step) { for y in (y_lo..y_hi).step_by(step) {
for x in (x_lo..x_hi).step_by(step) { for x in (x_lo..x_hi).step_by(step) {
let plane_org = p_org.slice(po); let plane_org = p_org.slice(po);
let plane_ref = p_ref.slice(&PlaneOffset { x, y }); let plane_ref = p_ref.slice(&PlaneOffset { x, y });
let sad = get_sad(&plane_org, &plane_ref, blk_h, blk_w); let sad = get_sad(&plane_org, &plane_ref, blk_h, blk_w, bit_depth);
if sad < *lowest_sad { if sad < *lowest_sad {
*lowest_sad = sad; *lowest_sad = sad;
...@@ -163,7 +257,8 @@ fn adjust_bo(bo: &BlockOffset, fi: &FrameInvariants, blk_w: usize, blk_h: usize) ...@@ -163,7 +257,8 @@ fn adjust_bo(bo: &BlockOffset, fi: &FrameInvariants, blk_w: usize, blk_h: usize)
} }
pub fn estimate_motion_ss4( pub fn estimate_motion_ss4(
fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize, ref_idx: usize, bo: &BlockOffset fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize, ref_idx: usize,
bo: &BlockOffset, bit_depth: usize
) -> Option<MotionVector> { ) -> Option<MotionVector> {
if let Some(ref rec) = fi.rec_buffer.frames[ref_idx] { if let Some(ref rec) = fi.rec_buffer.frames[ref_idx] {
let blk_w = bsize.width(); let blk_w = bsize.width();
...@@ -184,8 +279,19 @@ pub fn estimate_motion_ss4( ...@@ -184,8 +279,19 @@ pub fn estimate_motion_ss4(
let mut best_mv = MotionVector { row: 0, col: 0 }; let mut best_mv = MotionVector { row: 0, col: 0 };
full_search( full_search(
x_lo, x_hi, y_lo, y_hi, blk_h >> 2, blk_w >> 2, x_lo,
&fs.input_qres, &rec.input_qres, &mut best_mv, &mut lowest_sad, &po, 1 x_hi,
y_lo,
y_hi,
blk_h >> 2,
blk_w >> 2,
&fs.input_qres,
&rec.input_qres,
&mut best_mv,
&mut lowest_sad,
&po,
1,
bit_depth
); );
Some(MotionVector { row: best_mv.row * 4, col: best_mv.col * 4 }) Some(MotionVector { row: best_mv.row * 4, col: best_mv.col * 4 })
...@@ -195,7 +301,8 @@ pub fn estimate_motion_ss4( ...@@ -195,7 +301,8 @@ pub fn estimate_motion_ss4(
} }
pub fn estimate_motion_ss2( pub fn estimate_motion_ss2(
fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize, ref_idx: usize, bo: &BlockOffset, pmvs: &[Option<MotionVector>; 3] fi: &FrameInvariants, fs: &FrameState, bsize: BlockSize, ref_idx: usize,
bo: &BlockOffset, pmvs: &[Option<MotionVector>; 3], bit_depth: usize
) -> Option<MotionVector> { ) -> Option<MotionVector> {
if let Some(ref rec) = fi.rec_buffer.frames[ref_idx] { if let Some(ref rec) = fi.rec_buffer.frames[ref_idx] {
let blk_w = bsize.width(); let blk_w = bsize.width();
...@@ -219,8 +326,19 @@ pub fn estimate_motion_ss2( ...@@ -219,8 +326,19 @@ pub fn estimate_motion_ss2(
let y_hi = po.y + (((pmv.row as isize / 8 + range).min(mvy_max / 8)) >> 1); let y_hi = po.y + (((pmv.row as isize / 8 + range).min(mvy_max / 8)) >> 1);
full_search( full_search(
x_lo, x_hi, y_lo, y_hi, blk_h >> 1, blk_w >> 1, x_lo,
&fs.input_hres, &rec.input_hres, &mut best_mv, &mut lowest_sad, &po, 1 x_hi,
y_lo,
y_hi,
blk_h >> 1,
blk_w >> 1,
&fs.input_hres,
&rec.input_hres,
&mut best_mv,
&mut lowest_sad,
&po,
1,
bit_depth
); );
} }
} }
...@@ -292,17 +410,21 @@ pub mod test { ...@@ -292,17 +410,21 @@ pub mod test {
(BLOCK_64X16, 93344), (BLOCK_64X16, 93344),
]; ];
let bit_depth: usize = 8;
let (input_plane, rec_plane) = setup_sad(); let (input_plane, rec_plane) = setup_sad();
for block in blocks { for block in blocks {
let bsw = block.0.width(); let bsw = block.0.width();
let bsh = block.0.height(); let bsh = block.0.height();
let po = PlaneOffset { x: 40, y: 40 }; let po = PlaneOffset { x: 40, y: 40 };
let mut input_slice = input_plane.slice(&po); let mut input_slice = input_plane.slice(&po);
let mut rec_slice = rec_plane.slice(&po); let mut rec_slice = rec_plane.slice(&po);
assert_eq!(block.1, get_sad(&mut input_slice, &mut rec_slice, bsw, bsh)); assert_eq!(
block.1,
get_sad(&mut input_slice, &mut rec_slice, bsw, bsh, bit_depth)
);
} }
} }
} }
...@@ -386,7 +386,10 @@ pub fn rdo_mode_decision( ...@@ -386,7 +386,10 @@ pub fn rdo_mode_decision(
let slot_idx = fi.ref_frames[i - LAST_FRAME]; let slot_idx = fi.ref_frames[i - LAST_FRAME];
ref_slot_set.push(slot_idx); ref_slot_set.push(slot_idx);
let pmv = pmvs[slot_idx as usize].unwrap(); let pmv = pmvs[slot_idx as usize].unwrap();
mvs_from_me.push([motion_estimation(fi, fs, bsize, bo, i, pmv), MotionVector { row: 0, col: 0 }]); mvs_from_me.push([
motion_estimation(fi, fs, bsize, bo, i, pmv, seq.bit_depth),
MotionVector { row: 0, col: 0 }
]);
} }
} }
assert!(ref_frames_set.len() != 0); assert!(ref_frames_set.len() != 0);
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment