Commit d1ebacea authored by David Michael Barr

Clean up CfL implementation

* CPU feature check for SSSE3.
* Measure rdo_cfl_alpha() speed.
* Remove redundant loop for 2x speed-up of rdo_cfl_alpha().
* Allow rdo_cfl_alpha() to return None.
* Tidy up formatting.
parent 2fab8c39
......@@ -21,6 +21,7 @@ use rav1e::cdef::cdef_filter_frame;
use rav1e::ec;
use rav1e::partition::*;
use rav1e::predict::*;
use rav1e::rdo::rdo_cfl_alpha;
#[cfg(feature = "comparative_bench")]
mod comparative;
......@@ -108,11 +109,33 @@ fn cdef_frame_bench(b: &mut Bencher, w: usize, h: usize) {
b.iter(|| cdef_filter_frame(&fi, &mut fs.rec, &mut bc, 8));
}
/// Registers one `cfl_rdo_bench` benchmark per block size listed below.
///
/// Each benchmark is named `cfl_rdo(<block size>)`, where the block size is
/// rendered with its `Debug` formatting.
fn cfl_rdo(c: &mut Criterion) {
  let block_sizes = [
    BlockSize::BLOCK_4X4,
    BlockSize::BLOCK_8X8,
    BlockSize::BLOCK_16X16,
    BlockSize::BLOCK_32X32
  ];
  for bsize in block_sizes.iter().cloned() {
    let bench_name = format!("cfl_rdo({:?})", bsize);
    // `move` copies the block size into the closure handed to Criterion.
    c.bench_function(&bench_name, move |b| cfl_rdo_bench(b, bsize));
  }
}
/// Times `rdo_cfl_alpha()` for a single block size.
///
/// The frame invariants are built from `(1024, 1024)` and an encoder
/// configuration with `quantizer: 100` and `speed: 10`; the frame state is
/// created once, outside the timed closure, and reused by every iteration.
fn cfl_rdo_bench(b: &mut Bencher, bsize: BlockSize) {
  // Passed as the `bit_depth` argument of `rdo_cfl_alpha()`.
  let bit_depth = 8;
  let frame_invariants = FrameInvariants::new(
    1024,
    1024,
    EncoderConfig { quantizer: 100, speed: 10, ..Default::default() }
  );
  let mut frame_state = FrameState::new(&frame_invariants);
  let block_offset = BlockOffset { x: 1, y: 1 };
  b.iter(|| rdo_cfl_alpha(&mut frame_state, &block_offset, bsize, bit_depth))
}
// Benchmark groups. Each `criterion_group!` takes the group name first,
// followed by the benchmark function(s) that belong to it.
criterion_group!(
intra_prediction,
predict::pred_bench,
);
// Group for the `cfl_rdo` benchmarks of `rdo_cfl_alpha()`.
criterion_group!(cfl, cfl_rdo);
criterion_group!(cdef, cdef_frame);
criterion_group!(write_block, write_b);
......@@ -120,4 +143,4 @@ criterion_group!(write_block, write_b);
criterion_main!(comparative::intra_prediction);
#[cfg(not(feature = "comparative_bench"))]
criterion_main!(write_block, intra_prediction, cdef);
criterion_main!(write_block, intra_prediction, cdef, cfl);
......@@ -451,7 +451,8 @@ pub trait Intra: Dim {
while (i as usize) < Self::W {
let ac_q3 = _mm_loadu_si128(luma.offset(i) as *const _);
let ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
let abs_scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
let abs_scaled_luma_q0 =
_mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
let scaled_luma_q0 = _mm_sign_epi16(abs_scaled_luma_q0, ac_sign);
let pred = _mm_add_epi16(scaled_luma_q0, dc_q0);
let res = _mm_min_epi16(max, _mm_max_epi16(pred, _mm_setzero_si128()));
......@@ -470,14 +471,20 @@ pub trait Intra: Dim {
output: &mut [u16], stride: usize, ac: &[i16], alpha: i16,
bit_depth: usize
) {
if alpha == 0 { return; }
if alpha == 0 {
return;
}
assert!(32 >= Self::W);
assert!(ac.len() >= 32 * (Self::H - 1) + Self::W);
assert!(stride >= Self::W);
assert!(output.len() >= stride * (Self::H - 1) + Self::W);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe {
return Self::pred_cfl_ssse3(output, stride, ac, alpha, bit_depth);
{
if is_x86_feature_detected!("ssse3") {
return unsafe {
Self::pred_cfl_ssse3(output, stride, ac, alpha, bit_depth)
};
}
}
let sample_max = (1 << bit_depth) - 1;
......@@ -721,13 +728,15 @@ pub mod test {
let o1 = vec![0u16; 32 * 32];
let o2 = vec![0u16; 32 * 32];
let max: u16 = (1 << bit_depth) - 1;
let above: Vec<u16> = (0..32).map(|_| ra.gen())
.map(|v: u16| v & max).collect();
let left: Vec<u16> = (0..32).map(|_| ra.gen())
.map(|v: u16| v & max).collect();
let above: Vec<u16> =
(0..32).map(|_| ra.gen()).map(|v: u16| v & max).collect();
let left: Vec<u16> =
(0..32).map(|_| ra.gen()).map(|v: u16| v & max).collect();
let luma_max: i16 = (1 << (bit_depth + 3)) - 1;
let ac: Vec<i16> = (0..(32 * 32)).map(|_| ra.gen())
.map(|v: i16| (v & luma_max) - (luma_max >> 1)).collect();
let ac: Vec<i16> = (0..(32 * 32))
.map(|_| ra.gen())
.map(|v: i16| (v & luma_max) - (luma_max >> 1))
.collect();
let alpha = -1 as i16;
(above, left, ac, alpha, o1, o2)
......
......@@ -294,14 +294,21 @@ pub fn rdo_mode_decision(
for &chroma_mode in &mode_set_chroma {
let mut cfl = CFLParams::new();
if chroma_mode == PredictionMode::UV_CFL_PRED {
if !best_mode_chroma.is_intra() { continue; }
if !best_mode_chroma.is_intra() {
continue;
}
let cw_checkpoint = cw.checkpoint();
let mut wr: &mut dyn Writer = &mut WriterCounter::new();
write_tx_blocks(
fi, fs, cw, wr, luma_mode, luma_mode, bo, bsize, tx_size, tx_type, false, seq.bit_depth, cfl, true
);
cw.rollback(&cw_checkpoint);
cfl = rdo_cfl_alpha(fs, bo, bsize, seq.bit_depth);
match rdo_cfl_alpha(fs, bo, bsize, seq.bit_depth) {
Some(params) => {
cfl = params;
}
None => continue
}
}
for &skip in &[false, true] {
......@@ -364,52 +371,47 @@ pub fn rdo_mode_decision(
}
}
// NOTE(review): this span is a rendered diff whose +/- markers were lost, so
// two versions of `rdo_cfl_alpha` appear interleaved here (two `fn` headers,
// two return types, two copies of the `match` arms, two search
// implementations). It is not compilable as shown.
//
// Presumably the `pub fn ... -> Option<CFLParams>` lines are the added version
// (the commit message says the function may now return None) and the
// `fn ... -> CFLParams` lines are the removed one -- confirm against the
// repository before relying on this.
//
// Both versions pick a transform size from `bsize`, fill `ac` through
// `luma_ac`, and then, for planes 1 and 2, try every alpha in -16..17 by
// running `UV_CFL_PRED.predict_intra` into a mutable slice of the
// reconstruction plane and scoring it with `sse_wxh` against the input plane.
fn rdo_cfl_alpha(
pub fn rdo_cfl_alpha(
fs: &mut FrameState, bo: &BlockOffset, bsize: BlockSize, bit_depth: usize
) -> CFLParams {
) -> Option<CFLParams> {
// TODO: these are only valid for 4:2:0
let uv_tx_size = match bsize {
BlockSize::BLOCK_4X4 | BlockSize::BLOCK_8X8 => TxSize::TX_4X4,
BlockSize::BLOCK_16X16 => TxSize::TX_8X8,
BlockSize::BLOCK_32X32 => TxSize::TX_16X16,
_ => TxSize::TX_32X32
BlockSize::BLOCK_4X4 | BlockSize::BLOCK_8X8 => TxSize::TX_4X4,
BlockSize::BLOCK_16X16 => TxSize::TX_8X8,
BlockSize::BLOCK_32X32 => TxSize::TX_16X16,
_ => TxSize::TX_32X32
};
let mut ac = [0i16; 32 * 32];
luma_ac(&mut ac, fs, bo, bsize);
// First search variant: tabulate the score of every alpha per plane
// (33 entries, indexed by `alpha + 16`) ...
let mut alpha_sse = [[0u64; 33]; 2];
for p in 1..3 {
let rec = &mut fs.rec.planes[p];
let input = &fs.input.planes[p];
let po = bo.plane_offset(&fs.input.planes[p].cfg);
for alpha in -16..17 {
PredictionMode::UV_CFL_PRED.predict_intra(
&mut rec.mut_slice(&po), uv_tx_size, bit_depth, &ac, alpha);
alpha_sse[(p - 1) as usize][(alpha + 16) as usize] = sse_wxh(
&input.slice(&po),
&rec.slice(&po),
uv_tx_size.width(),
uv_tx_size.height()
);
}
}
// ... then scan all (alpha_u, alpha_v) pairs for the smallest summed score.
// The pair (0, 0) is skipped, so this variant always selects a non-zero pair.
let mut best_cfl = CFLParams::new();
let mut best_rd = std::u64::MAX;
for alpha_u in -16..17 {
for alpha_v in -16..17 {
if alpha_u == 0 && alpha_v == 0 { continue; }
let cfl = CFLParams::from_alpha(alpha_u, alpha_v);
let rd = alpha_sse[0][(alpha_u + 16) as usize] +
alpha_sse[1][(alpha_v + 16) as usize];
if rd < best_rd {
best_rd = rd;
best_cfl = cfl;
}
}
// Second search variant: the score being summed is per-plane, so each plane's
// best alpha is chosen independently with `min_by_key`.
let best_alpha: Vec<i16> = (1..3)
.map(|p| {
let rec = &mut fs.rec.planes[p];
let input = &fs.input.planes[p];
let po = bo.plane_offset(&fs.input.planes[p].cfg);
(-16i16..17i16)
.min_by_key(|&alpha| {
PredictionMode::UV_CFL_PRED.predict_intra(
&mut rec.mut_slice(&po),
uv_tx_size,
bit_depth,
&ac,
alpha
);
sse_wxh(
&input.slice(&po),
&rec.slice(&po),
uv_tx_size.width(),
uv_tx_size.height()
)
// The range -16..17 is never empty, so `min_by_key` always yields a value.
}).unwrap()
}).collect();
// NOTE(review): unlike the pair scan above, which skips (0, 0) and settles for
// the next-best pair, this variant reports "no result" when both per-plane
// minima are 0.
if best_alpha[0] == 0 && best_alpha[1] == 0 {
None
} else {
Some(CFLParams::from_alpha(best_alpha[0], best_alpha[1]))
}
best_cfl
}
// RDO-based intra frame transform type decision
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment