diff --git a/benches/predict.rs b/benches/predict.rs
index 8a25a2b74f0f9914221a668616cfb9e71e896f65..a15c3c2df93f16f20ad7f249779766cdfff25950 100644
--- a/benches/predict.rs
+++ b/benches/predict.rs
@@ -25,8 +25,23 @@ pub fn generate_block(rng: &mut ChaChaRng) -> (Vec<u16>, Vec<u16>, Vec<u16>) {
   (block, above_context, left_context)
 }
 
+/// Returns a zero-filled `u8` block of `BLOCK_SIZE` samples together with
+/// randomly generated above and left context vectors.
+pub fn generate_block_u8(rng: &mut ChaChaRng) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
+  let block = vec![0u8; BLOCK_SIZE.width() * BLOCK_SIZE.height()];
+  // NOTE(review): `above_context` is sized by height and `left_context` by
+  // width; the lengths only coincide for square blocks -- confirm intent.
+  let above_context: Vec<u8> =
+    (0..BLOCK_SIZE.height()).map(|_| rng.gen()).collect();
+  let left_context: Vec<u8> =
+    (0..BLOCK_SIZE.width()).map(|_| rng.gen()).collect();
+
+  (block, above_context, left_context)
+}
+
 pub fn pred_bench(c: &mut Criterion) {
   c.bench_function("intra_dc_4x4", |b| intra_dc_4x4(b));
+  c.bench_function("intra_dc_128_4x4_u8", |b| intra_dc_128_4x4_u8(b));
   c.bench_function("intra_dc_left_4x4", |b| intra_dc_left_4x4(b));
   c.bench_function("intra_dc_top_4x4", |b| intra_dc_top_4x4(b));
   c.bench_function("intra_h_4x4", |b| intra_h_4x4(b));
@@ -54,6 +69,23 @@ pub fn intra_dc_4x4(b: &mut Bencher) {
   })
 }
 
+/// Benchmarks `Block4x4::pred_dc_128` on an 8-bit (`u8`) block buffer.
+pub fn intra_dc_128_4x4_u8(b: &mut Bencher) {
+  let mut ra = ChaChaRng::from_seed([0; 32]);
+  // `pred_dc_128` takes no context arguments, so the edges stay unused.
+  let (mut block, _above, _left) = generate_block_u8(&mut ra);
+
+  b.iter(|| {
+    for _ in 0..MAX_ITER {
+      Block4x4::pred_dc_128(
+        &mut block,
+        BLOCK_SIZE.width(),
+        8
+      );
+    }
+  })
+}
+
 pub fn intra_dc_left_4x4(b: &mut Bencher) {
   let mut ra = ChaChaRng::from_seed([0; 32]);
   let (mut block, above, left) = generate_block(&mut ra);
diff --git a/build.rs b/build.rs
index 2928f3ac0c1710e947790b6660df3ced4496a653..c9f92f88bcead12ca2011bd085a358c27a5503b1 100644
--- a/build.rs
+++ b/build.rs
@@ -14,22 +14,29 @@ use std::fs;
 use std::path::Path;
 
 fn main() {
-  #[cfg(target_arch = "x86_64")] {
+  #[cfg(all(target_arch = "x86_64", not(windows)))] {
     use std::fs::File;
     use std::io::Write;
     let out_dir = env::var("OUT_DIR").unwrap();
     {
       let dest_path = Path::new(&out_dir).join("config.asm");
       let mut config_file = File::create(dest_path).unwrap();
-      config_file.write(b" %define ARCH_X86_32 0\n").unwrap();
-      config_file.write(b" %define ARCH_X86_64 1\n").unwrap();
-      config_file.write(b" %define PIC 1\n").unwrap();
-      config_file.write(b" %define STACK_ALIGNMENT 32\n").unwrap();
+      // `write_all` rather than `write`: a bare `write` may write only part
+      // of the buffer and still return `Ok`.
+      config_file.write_all(b" %define private_prefix rav1e\n").unwrap();
+      config_file.write_all(b" %define ARCH_X86_32 0\n").unwrap();
+      config_file.write_all(b" %define ARCH_X86_64 1\n").unwrap();
+      config_file.write_all(b" %define PIC 1\n").unwrap();
+      config_file.write_all(b" %define STACK_ALIGNMENT 32\n").unwrap();
+      if cfg!(target_os="macos") {
+        config_file.write_all(b" %define PREFIX 1\n").unwrap();
+      }
     }
     let mut config_include_arg = String::from("-I");
     config_include_arg.push_str(&out_dir);
     config_include_arg.push('/');
-    nasm_rs::compile_library_args("rav1easm", &["src/x86/mc.asm"], &[&config_include_arg, "-Isrc/"]);
+    nasm_rs::compile_library_args("rav1easm", &["src/x86/ipred.asm"], &[&config_include_arg, "-Isrc/"]);
+    println!("cargo:rustc-link-lib=static=rav1easm");
   }
 
   if cfg!(windows) && cfg!(feature = "decode_test") {
diff --git a/src/predict.rs b/src/predict.rs
index f9971e49fc140482e4cbd7e561e5855d930c4b68..65b8f747eb84db77af8967c250909e526399774d 100644
--- a/src/predict.rs
+++ b/src/predict.rs
@@ -11,7 +11,6 @@
 #![cfg_attr(feature = "cargo-clippy", allow(cast_lossless))]
 #![cfg_attr(feature = "cargo-clippy", allow(needless_range_loop))]
 
-#[cfg(test)]
 use libc;
 use num_traits::*;
 
@@ -217,6 +216,20 @@ fn get_scaled_luma_q0(alpha_q3: i16, ac_pred_q3: i16) -> i32 {
   }
 }
 
+#[cfg(all(target_arch = "x86_64", not(windows)))]
+extern "C" {
+  /// AVX2 assembly kernel used by `pred_dc_128` for one-byte samples;
+  /// presumably provided by `src/x86/ipred.asm`, assembled in `build.rs`.
+  ///
+  /// NOTE(review): presumably writes `width` bytes on each of `height` rows
+  /// spaced `stride` bytes apart from `dst`, and ignores `topleft`/`angle`
+  /// -- confirm against the assembly source.
+  fn rav1e_ipred_dc_128_avx2(
+    dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8,
+    width: libc::c_int, height: libc::c_int, angle: libc::c_int
+  );
+}
+
 // TODO: rename the type bounds later
 pub trait Intra<T>: Dim
 where
@@ -241,6 +254,39 @@ where
 
+  /// Fills the `W`x`H` block at the start of `output` (rows `stride` apart)
+  /// with the constant `128 << (bit_depth - 8)`.
   #[cfg_attr(feature = "comparative_bench", inline(never))]
   fn pred_dc_128(output: &mut [T], stride: usize, bit_depth: usize) {
+    #[cfg(all(target_arch = "x86_64", not(windows)))]
+    {
+      // Take the assembly path only when the whole `W`x`H` block lies inside
+      // `output`; otherwise fall through so the scalar loop's bounds checks
+      // panic instead of the kernel writing through a raw pointer.
+      let fits = Self::W > 0
+        && Self::H > 0
+        && (Self::H - 1) * stride + Self::W <= output.len();
+      if std::mem::size_of::<T>() == 1
+        && bit_depth == 8
+        && fits
+        && is_x86_feature_detected!("avx2")
+      {
+        // SAFETY: `T` is one byte wide and `fits` guarantees that `H` rows
+        // of `W` bytes spaced `stride` apart stay within `output`.
+        // NOTE(review): assumes the kernel writes exactly that region, never
+        // dereferences the null `topleft`, and has no alignment requirement
+        // on `dst`/`stride` -- confirm against src/x86/ipred.asm.
+        unsafe {
+          rav1e_ipred_dc_128_avx2(
+            output.as_mut_ptr() as *mut u8,
+            stride as libc::ptrdiff_t,
+            std::ptr::null(),
+            Self::W as libc::c_int,
+            Self::H as libc::c_int,
+            0
+          );
+        }
+        return;
+      }
+    }
     for y in 0..Self::H {
       for x in 0..Self::W {
         output[y * stride + x] = (128u32 << (bit_depth - 8)).as_();
@@ -874,6 +920,21 @@ pub mod test {
     }
   }
 
+  /// `pred_dc_128` on `u8` must write 128 into the 4x4 block and leave
+  /// every other byte of the buffer untouched.
+  #[test]
+  fn pred_matches_u8() {
+    let mut o = vec![0u8; 32 * 32];
+
+    Block4x4::pred_dc_128(&mut o, 32, 8);
+
+    for (y, l) in o.chunks(32).enumerate() {
+      let filled = if y < 4 { 4 } else { 0 };
+      assert!(l[..filled].iter().all(|&p| p == 128));
+      assert!(l[filled..].iter().all(|&p| p == 0));
+    }
+  }
+
   #[test]
   fn pred_same() {
     let mut ra = ChaChaRng::from_seed([0; 32]);