From 8e6d7a09e5bab2759b0a969e2916da788b47a4eb Mon Sep 17 00:00:00 2001 From: David Michael Barr <b@rr-dav.id.au> Date: Mon, 22 Oct 2018 21:28:22 +0900 Subject: [PATCH] Make CfL predictor generic over u8 and u16 --- src/predict.rs | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/src/predict.rs b/src/predict.rs index c65c5b50..5c7c9d08 100644 --- a/src/predict.rs +++ b/src/predict.rs @@ -432,12 +432,13 @@ where #[target_feature(enable = "ssse3")] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe fn pred_cfl_ssse3( - output: &mut [u16], stride: usize, ac: &[i16], alpha: i16, + output: &mut [T], stride: usize, ac: &[i16], alpha: i16, bit_depth: usize ) { let alpha_sign = _mm_set1_epi16(alpha); let alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); - let dc_q0 = _mm_set1_epi16(*output.as_ptr() as i16); + let dc_scalar: u32 = (*output.as_ptr()).into(); + let dc_q0 = _mm_set1_epi16(dc_scalar as i16); let max = _mm_set1_epi16((1 << bit_depth) - 1); for j in 0..Self::H { @@ -445,6 +446,7 @@ where let line = output.as_mut_ptr().offset((stride * j) as isize); let mut i = 0isize; + let mut last = _mm_setzero_si128(); while (i as usize) < Self::W { let ac_q3 = _mm_loadu_si128(luma.offset(i) as *const _); let ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); @@ -452,11 +454,27 @@ where _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); let scaled_luma_q0 = _mm_sign_epi16(abs_scaled_luma_q0, ac_sign); let pred = _mm_add_epi16(scaled_luma_q0, dc_q0); - let res = _mm_min_epi16(max, _mm_max_epi16(pred, _mm_setzero_si128())); - if Self::W == 4 { - _mm_storel_epi64(line.offset(i) as *mut _, res); + if size_of::<T>() == 1 { + if Self::W < 16 { + let res = _mm_packus_epi16(pred, pred); + if Self::W == 4 { + *(line.offset(i) as *mut i32) = _mm_cvtsi128_si32(res); + } else { + _mm_storel_epi64(line.offset(i) as *mut _, res); + } + } else if (i & 15) == 0 { + last = pred; + } else { + let res = _mm_packus_epi16(last, pred); + _mm_storeu_si128(line.offset(i - 8) as *mut _, res); + } } else { - _mm_storeu_si128(line.offset(i) as *mut _, res); + let res = _mm_min_epi16(max, _mm_max_epi16(pred, _mm_setzero_si128())); + if Self::W == 4 { + _mm_storel_epi64(line.offset(i) as *mut _, res); + } else { + _mm_storeu_si128(line.offset(i) as *mut _, res); + } } i += 8; } @@ -465,7 +483,7 @@ where #[cfg_attr(feature = "comparative_bench", inline(never))] fn pred_cfl( - output: &mut [u16], stride: usize, ac: &[i16], alpha: i16, + output: &mut [T], stride: usize, ac: &[i16], alpha: i16, bit_depth: usize ) { if alpha == 0 { @@ -485,14 +503,14 @@ where } let sample_max = (1 << bit_depth) - 1; - let avg = output[0] as i32; + let avg: i32 = output[0].into(); for (line, luma) in output.chunks_mut(stride).zip(ac.chunks(32)).take(Self::H) { for (v, &l) in line[..Self::W].iter_mut().zip(luma[..Self::W].iter()) { *v = - (avg + get_scaled_luma_q0(alpha, l)).max(0).min(sample_max) as u16; + (avg + get_scaled_luma_q0(alpha, l)).max(0).min(sample_max).as_(); } } } -- GitLab