diff --git a/rustfmt.toml b/rustfmt.toml index e6ae70fe0d09b5d30afbf4b2218ebaec94a58bec..0c9065f37257c48b6d45e8417e5a96f0daa1c544 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -16,7 +16,7 @@ ignore = [ # "src/ec.rs", # Clean "src/lib.rs", # Did not clean yet to avoid conflicts with open PRs. "src/cdef.rs", # Did not try to clean yet. - "src/deblock.rs", # Did not try to clean yet. + # "src/deblock.rs", # Clean # "src/partition.rs", # Clean # "src/plane.rs", # Clean # "src/predict.rs", # Clean diff --git a/src/deblock.rs b/src/deblock.rs index 8de7de435b1b10b61a87cd1049453256b5afbe80..75b88e91866ca04b047d417964e4f9810af735f6 100644 --- a/src/deblock.rs +++ b/src/deblock.rs @@ -9,962 +9,1376 @@ #![allow(safe_extern_statics)] -use std::cmp; use context::*; +use partition::PredictionMode::*; +use partition::*; use plane::*; -use util::ILog; use quantize::*; -use partition::*; -use partition::PredictionMode::*; +use std::cmp; use util::clamp; +use util::ILog; +use DeblockState; use FrameInvariants; -use FrameType; use FrameState; -use DeblockState; - -fn deblock_adjusted_level(deblock: &DeblockState, block: &Block, pli: usize, vertical: bool) -> usize { - let idx = if pli == 0 { if vertical { 0 } else { 1 } } else { pli+1 }; - - let level = if deblock.block_deltas_enabled { - // By-block filter strength delta, if the feature is active. - let block_delta = if deblock.block_delta_multi { - block.deblock_deltas[ idx ] << deblock.block_delta_shift - } else { - block.deblock_deltas[ 0 ] << deblock.block_delta_shift - }; +use FrameType; - // Add to frame-specified filter strength (Y-vertical, Y-horizontal, U, V) - clamp(block_delta + deblock.levels[idx] as i8, 0, MAX_LOOP_FILTER as i8) as u8 +fn deblock_adjusted_level( + deblock: &DeblockState, block: &Block, pli: usize, vertical: bool +) -> usize { + let idx = if pli == 0 { + if vertical { + 0 + } else { + 1 + } + } else { + pli + 1 + }; + + let level = if deblock.block_deltas_enabled { + // By-block filter strength delta, if the feature is active. + let block_delta = if deblock.block_delta_multi { + block.deblock_deltas[idx] << deblock.block_delta_shift } else { - deblock.levels[idx] + block.deblock_deltas[0] << deblock.block_delta_shift }; - // if fi.seg_feaure_active { - // rav1e does not yet support segments or segment features - // } - - // Are delta modifiers for specific references and modes active? If so, add them too. - if deblock.deltas_enabled { - let mode = block.mode; - let reference = block.ref_frames[0]; - let mode_type = if mode >= NEARESTMV && mode != GLOBALMV && mode!= GLOBAL_GLOBALMV {1} else {0}; - let l5 = level >> 5; - clamp (level as i32 + ((deblock.ref_deltas[reference] as i32) << l5) + - if reference == INTRA_FRAME { - 0 - } else { - (deblock.mode_deltas[mode_type] as i32) << l5 - }, 0, MAX_LOOP_FILTER as i32) as usize - } else { - level as usize - } + // Add to frame-specified filter strength (Y-vertical, Y-horizontal, U, V) + clamp(block_delta + deblock.levels[idx] as i8, 0, MAX_LOOP_FILTER as i8) + as u8 + } else { + deblock.levels[idx] + }; + + // if fi.seg_feaure_active { + // rav1e does not yet support segments or segment features + // } + + // Are delta modifiers for specific references and modes active? If so, add them too. + if deblock.deltas_enabled { + let mode = block.mode; + let reference = block.ref_frames[0]; + let mode_type = + if mode >= NEARESTMV && mode != GLOBALMV && mode != GLOBAL_GLOBALMV { + 1 + } else { + 0 + }; + let l5 = level >> 5; + clamp( + level as i32 + + ((deblock.ref_deltas[reference] as i32) << l5) + + if reference == INTRA_FRAME { + 0 + } else { + (deblock.mode_deltas[mode_type] as i32) << l5 + }, + 0, + MAX_LOOP_FILTER as i32 + ) as usize + } else { + level as usize + } } -fn deblock_left<'a>(bc: &'a BlockContext, in_bo: &BlockOffset, p: &Plane) -> &'a Block { - let xdec = p.cfg.xdec; - let ydec = p.cfg.ydec; +fn deblock_left<'a>( + bc: &'a BlockContext, in_bo: &BlockOffset, p: &Plane +) -> &'a Block { + let xdec = p.cfg.xdec; + let ydec = p.cfg.ydec; - // This little bit of weirdness is straight out of the spec; - // subsampled chroma uses odd mi row/col - let bo = BlockOffset{x: in_bo.x | xdec, y: in_bo.y | ydec}; + // This little bit of weirdness is straight out of the spec; + // subsampled chroma uses odd mi row/col + let bo = BlockOffset { x: in_bo.x | xdec, y: in_bo.y | ydec }; - // We already know we're not at the upper/left corner, so prev_block is in frame - bc.at(&bo.with_offset(-1 << xdec, 0)) + // We already know we're not at the upper/left corner, so prev_block is in frame + bc.at(&bo.with_offset(-1 << xdec, 0)) } -fn deblock_up<'a>(bc: &'a BlockContext, in_bo: &BlockOffset, p: &Plane) -> &'a Block { - let xdec = p.cfg.xdec; - let ydec = p.cfg.ydec; +fn deblock_up<'a>( + bc: &'a BlockContext, in_bo: &BlockOffset, p: &Plane +) -> &'a Block { + let xdec = p.cfg.xdec; + let ydec = p.cfg.ydec; - // This little bit of weirdness is straight out of the spec; - // subsampled chroma uses odd mi row/col - let bo = BlockOffset{x: in_bo.x | xdec, y: in_bo.y | ydec}; - - // We already know we're not at the upper/left corner, so prev_block is in frame - bc.at(&bo.with_offset(0, -1 << ydec)) + // This little bit of weirdness is straight out of the spec; + // subsampled chroma uses odd mi row/col + let bo = BlockOffset { x: in_bo.x | xdec, y: in_bo.y | ydec }; + + // We already know we're not at the upper/left corner, so prev_block is in frame + bc.at(&bo.with_offset(0, -1 << ydec)) } // Must be called on a tx edge, and not on a frame edge. This is enforced above the call. -fn deblock_size(block: &Block, prev_block: &Block, p: &Plane, pli: usize, - vertical: bool, block_edge: bool) -> usize { - let xdec = p.cfg.xdec; - let ydec = p.cfg.ydec; - - // filter application is conditional on skip and block edge - if !(block_edge || !block.skip || !prev_block.skip || - block.ref_frames[0] <= INTRA_FRAME || prev_block.ref_frames[0] <= INTRA_FRAME) { - 0 +fn deblock_size( + block: &Block, prev_block: &Block, p: &Plane, pli: usize, vertical: bool, + block_edge: bool +) -> usize { + let xdec = p.cfg.xdec; + let ydec = p.cfg.ydec; + + // filter application is conditional on skip and block edge + if !(block_edge + || !block.skip + || !prev_block.skip + || block.ref_frames[0] <= INTRA_FRAME + || prev_block.ref_frames[0] <= INTRA_FRAME) + { + 0 + } else { + let (tx_size, prev_tx_size) = if vertical { + (cmp::max(block.tx_w >> xdec, 1), cmp::max(prev_block.tx_w >> xdec, 1)) } else { - let (tx_size, prev_tx_size) = if vertical { - (cmp::max(block.tx_w>>xdec, 1), cmp::max(prev_block.tx_w>>xdec, 1)) - } else { - (cmp::max(block.tx_h>>ydec, 1), cmp::max(prev_block.tx_h>>ydec, 1)) - }; + (cmp::max(block.tx_h >> ydec, 1), cmp::max(prev_block.tx_h >> ydec, 1)) + }; - cmp::min( if pli==0 {14} else {6}, cmp::min(tx_size, prev_tx_size) << MI_SIZE_LOG2) - } + cmp::min( + if pli == 0 { 14 } else { 6 }, + cmp::min(tx_size, prev_tx_size) << MI_SIZE_LOG2 + ) + } } // Must be called on a tx edge -fn deblock_level(deblock: &DeblockState, block: &Block, prev_block: &Block, - pli: usize, vertical: bool) -> usize { - - let level = deblock_adjusted_level(deblock, block, pli, vertical); - if level == 0 { - deblock_adjusted_level(deblock, prev_block, pli, vertical) - } else { - level - } +fn deblock_level( + deblock: &DeblockState, block: &Block, prev_block: &Block, pli: usize, + vertical: bool +) -> usize { + let level = deblock_adjusted_level(deblock, block, pli, vertical); + if level == 0 { + deblock_adjusted_level(deblock, prev_block, pli, vertical) + } else { + level + } } // four taps, 4 outputs (two are trivial) -fn filter_narrow2_4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> [i32; 4] { - let filter0 = clamp(p1 - q1, -128 << shift, (128 << shift) - 1); - let filter1 = clamp(filter0 + 3*(q0 - p0) + 4, -128 << shift, (128 << shift)-1) >> 3; - // be certain our optimization removing a clamp is sound - debug_assert!( { let base = clamp(filter0 + 3 * (q0 - p0), -128 << shift, (128 << shift) - 1); - let test = clamp(base + 4, -128 << shift, (128 << shift)-1) >> 3; - filter1 == test } ); - let filter2 = clamp(filter0 + 3*(q0 - p0) + 3, -128 << shift, (128 << shift)-1) >> 3; - // be certain our optimization removing a clamp is sound - debug_assert!( { let base = clamp(filter0 + 3 * (q0 - p0), -128 << shift, (128 << shift) - 1); - let test = clamp(base + 3, -128 << shift, (128 << shift)-1) >> 3; - filter2 == test } ); - [p1, - clamp(p0 + filter2, 0, (256 << shift)-1), - clamp(q0 - filter1, 0, (256 << shift)-1), - q1] +fn filter_narrow2_4( + p1: i32, p0: i32, q0: i32, q1: i32, shift: usize +) -> [i32; 4] { + let filter0 = clamp(p1 - q1, -128 << shift, (128 << shift) - 1); + let filter1 = + clamp(filter0 + 3 * (q0 - p0) + 4, -128 << shift, (128 << shift) - 1) >> 3; + // be certain our optimization removing a clamp is sound + debug_assert!({ + let base = + clamp(filter0 + 3 * (q0 - p0), -128 << shift, (128 << shift) - 1); + let test = clamp(base + 4, -128 << shift, (128 << shift) - 1) >> 3; + filter1 == test + }); + let filter2 = + clamp(filter0 + 3 * (q0 - p0) + 3, -128 << shift, (128 << shift) - 1) >> 3; + // be certain our optimization removing a clamp is sound + debug_assert!({ + let base = + clamp(filter0 + 3 * (q0 - p0), -128 << shift, (128 << shift) - 1); + let test = clamp(base + 3, -128 << shift, (128 << shift) - 1) >> 3; + filter2 == test + }); + [ + p1, + clamp(p0 + filter2, 0, (256 << shift) - 1), + clamp(q0 - filter1, 0, (256 << shift) - 1), + q1 + ] } // six taps, 6 outputs (four are trivial) -fn filter_narrow2_6(p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize) -> [i32; 6] { - let x = filter_narrow2_4(p1, p0, q0, q1, shift); - [p2, x[0], x[1], x[2], x[3], q2] +fn filter_narrow2_6( + p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize +) -> [i32; 6] { + let x = filter_narrow2_4(p1, p0, q0, q1, shift); + [p2, x[0], x[1], x[2], x[3], q2] } // 12 taps, 12 outputs (ten are trivial) -fn filter_narrow2_12(p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, - q0: i32, q1: i32, q2: i32, q3: i32, q4: i32, q5: i32, shift: usize) -> [i32; 12] { - let x = filter_narrow2_4(p1, p0, q0, q1, shift); - [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5] +fn filter_narrow2_12( + p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, + q2: i32, q3: i32, q4: i32, q5: i32, shift: usize +) -> [i32; 12] { + let x = filter_narrow2_4(p1, p0, q0, q1, shift); + [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5] } // four taps, 4 outputs -fn filter_narrow4_4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> [i32; 4] { - let filter1 = clamp(3 * (q0 - p0) + 4, -128 << shift, (128 << shift)-1) >> 3; - // be certain our optimization removing a clamp is sound - debug_assert!( { let base = clamp(3 * (q0 - p0), -128 << shift, (128 << shift) - 1); - let test = clamp(base + 4, -128 << shift, (128 << shift)-1) >> 3; - filter1 == test } ); - let filter2 = clamp(3 * (q0 - p0) + 3, -128 << shift, (128 << shift)-1) >> 3; - // be certain our optimization removing a clamp is sound - debug_assert!( { let base = clamp(3 * (q0 - p0), -128 << shift, (128 << shift) - 1); - let test = clamp(base + 3, -128 << shift, (128 << shift)-1) >> 3; - filter2 == test } ); - let filter3 = filter1 + 1 >> 1; - [clamp(p1 + filter3, 0, (256 << shift)-1), - clamp(p0 + filter2, 0, (256 << shift)-1), - clamp(q0 - filter1, 0, (256 << shift)-1), - clamp(q1 - filter3, 0, (256 << shift)-1)] +fn filter_narrow4_4( + p1: i32, p0: i32, q0: i32, q1: i32, shift: usize +) -> [i32; 4] { + let filter1 = + clamp(3 * (q0 - p0) + 4, -128 << shift, (128 << shift) - 1) >> 3; + // be certain our optimization removing a clamp is sound + debug_assert!({ + let base = clamp(3 * (q0 - p0), -128 << shift, (128 << shift) - 1); + let test = clamp(base + 4, -128 << shift, (128 << shift) - 1) >> 3; + filter1 == test + }); + let filter2 = + clamp(3 * (q0 - p0) + 3, -128 << shift, (128 << shift) - 1) >> 3; + // be certain our optimization removing a clamp is sound + debug_assert!({ + let base = clamp(3 * (q0 - p0), -128 << shift, (128 << shift) - 1); + let test = clamp(base + 3, -128 << shift, (128 << shift) - 1) >> 3; + filter2 == test + }); + let filter3 = filter1 + 1 >> 1; + [ + clamp(p1 + filter3, 0, (256 << shift) - 1), + clamp(p0 + filter2, 0, (256 << shift) - 1), + clamp(q0 - filter1, 0, (256 << shift) - 1), + clamp(q1 - filter3, 0, (256 << shift) - 1) + ] } // six taps, 6 outputs (two are trivial) -fn filter_narrow4_6(p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize) -> [i32; 6] { - let x = filter_narrow4_4(p1, p0, q0, q1, shift); - [p2, x[0], x[1], x[2], x[3], q2] +fn filter_narrow4_6( + p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize +) -> [i32; 6] { + let x = filter_narrow4_4(p1, p0, q0, q1, shift); + [p2, x[0], x[1], x[2], x[3], q2] } // 12 taps, 12 outputs (eight are trivial) -fn filter_narrow4_12(p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, - q0: i32, q1: i32, q2: i32, q3: i32, q4: i32, q5: i32, shift: usize) -> [i32; 12] { - let x = filter_narrow4_4(p1, p0, q0, q1, shift); - [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5] +fn filter_narrow4_12( + p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, + q2: i32, q3: i32, q4: i32, q5: i32, shift: usize +) -> [i32; 12] { + let x = filter_narrow4_4(p1, p0, q0, q1, shift); + [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5] } // six taps, 4 outputs #[cfg_attr(rustfmt, rustfmt_skip)] -fn filter_wide6_4(p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32) -> [i32; 4] { - [p2*3 + p1*2 + p0*2 + q0 + (1<<2) >> 3, - p2 + p1*2 + p0*2 + q0*2 + q1 + (1<<2) >> 3, - p1 + p0*2 + q0*2 + q1*2 + q2 + (1<<2) >> 3, - p0 + q0*2 + q1*2 + q2*3 + (1<<2) >> 3] +fn filter_wide6_4( + p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32 +) -> [i32; 4] { + [ + p2*3 + p1*2 + p0*2 + q0 + (1<<2) >> 3, + p2 + p1*2 + p0*2 + q0*2 + q1 + (1<<2) >> 3, + p1 + p0*2 + q0*2 + q1*2 + q2 + (1<<2) >> 3, + p0 + q0*2 + q1*2 + q2*3 + (1<<2) >> 3 + ] } // eight taps, 6 outputs #[cfg_attr(rustfmt, rustfmt_skip)] -fn filter_wide8_6(p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32) -> [i32; 6] { - [p3*3 + p2*2 + p1 + p0 + q0 + (1<<2) >> 3, - p3*2 + p2 + p1*2 + p0 + q0 + q1 + (1<<2) >> 3, - p3 + p2 + p1 + p0*2 + q0 + q1 + q2 +(1<<2) >> 3, - p2 + p1 + p0 + q0*2 + q1 + q2 + q3 + (1<<2) >> 3, - p1 + p0 + q0 + q1*2 + q2 + q3*2 + (1<<2) >> 3, - p0 + q0 + q1 + q2*2 + q3*3 + (1<<2) >> 3] +fn filter_wide8_6( + p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32 +) -> [i32; 6] { + [ + p3*3 + p2*2 + p1 + p0 + q0 + (1<<2) >> 3, + p3*2 + p2 + p1*2 + p0 + q0 + q1 + (1<<2) >> 3, + p3 + p2 + p1 + p0*2 + q0 + q1 + q2 +(1<<2) >> 3, + p2 + p1 + p0 + q0*2 + q1 + q2 + q3 + (1<<2) >> 3, + p1 + p0 + q0 + q1*2 + q2 + q3*2 + (1<<2) >> 3, + p0 + q0 + q1 + q2*2 + q3*3 + (1<<2) >> 3 + ] } // 12 taps, 12 outputs (six are trivial) -fn filter_wide8_12(p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, - q0: i32, q1: i32, q2: i32, q3: i32, q4: i32, q5: i32) -> [i32; 12] { - let x = filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3); - [p5, p4, p3, x[0], x[1], x[2], x[3], x[4], x[5], q3, q4, q5] +fn filter_wide8_12( + p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, + q2: i32, q3: i32, q4: i32, q5: i32 +) -> [i32; 12] { + let x = filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3); + [p5, p4, p3, x[0], x[1], x[2], x[3], x[4], x[5], q3, q4, q5] } // fourteen taps, 12 outputs #[cfg_attr(rustfmt, rustfmt_skip)] -fn filter_wide14_12(p6: i32, p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, - q0: i32, q1: i32, q2: i32, q3: i32, q4: i32, q5: i32, q6: i32) -> [i32; 12] { - [p6*7 + p5*2 + p4*2 + p3 + p2 + p1 + p0 + q0 + (1<<3) >> 4, - p6*5 + p5*2 + p4*2 + p3*2 + p2 + p1 + p0 + q0 + q1 + (1<<3) >> 4, - p6*4 + p5 + p4*2 + p3*2 + p2*2 + p1 + p0 + q0 + q1 + q2 + (1<<3) >> 4, - p6*3 + p5 + p4 + p3*2 + p2*2 + p1*2 + p0 + q0 + q1 + q2 + q3 + (1<<3) >> 4, - p6*2 + p5 + p4 + p3 + p2*2 + p1*2 + p0*2 + q0 + q1 + q2 + q3 + q4 + (1<<3) >> 4, - p6 + p5 + p4 + p3 + p2 + p1*2 + p0*2 + q0*2 + q1 + q2 + q3 + q4 + q5 + (1<<3) >> 4, - p5 + p4 + p3 + p2 + p1 + p0*2 + q0*2 + q1*2 + q2 + q3 + q4 + q5 + q6 + (1<<3) >> 4, - p4 + p3 + p2 + p1 + p0 + q0*2 + q1*2 + q2*2 + q3 + q4 + q5 + q6*2 + (1<<3) >> 4, - p3 + p2 + p1 + p0 + q0 + q1*2 + q2*2 + q3*2 + q4 + q5 + q6*3 + (1<<3) >> 4, - p2 + p1 + p0 + q0 + q1 + q2*2 + q3*2 + q4*2 + q5 + q6*4 + (1<<3) >> 4, - p1 + p0 + q0 + q1 + q2 + q3*2 + q4*2 + q5*2 + q6*5 + (1<<3) >> 4, - p0 + q0 + q1 + q2 + q3 + q4*2 + q5*2 + q6*7 + (1<<3) >> 4] -} +fn filter_wide14_12( + p6: i32, p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, + q1: i32, q2: i32, q3: i32, q4: i32, q5: i32, q6: i32 +) -> [i32; 12] { + [ + p6*7 + p5*2 + p4*2 + p3 + p2 + p1 + p0 + q0 + (1<<3) >> 4, + p6*5 + p5*2 + p4*2 + p3*2 + p2 + p1 + p0 + q0 + q1 + (1<<3) >> 4, + p6*4 + p5 + p4*2 + p3*2 + p2*2 + p1 + p0 + q0 + q1 + q2 + (1<<3) >> 4, + p6*3 + p5 + p4 + p3*2 + p2*2 + p1*2 + p0 + q0 + q1 + q2 + q3 + (1<<3) >> 4, + p6*2 + p5 + p4 + p3 + p2*2 + p1*2 + p0*2 + q0 + q1 + q2 + q3 + q4 + (1<<3) >> 4, + p6 + p5 + p4 + p3 + p2 + p1*2 + p0*2 + q0*2 + q1 + q2 + q3 + q4 + q5 + (1<<3) >> 4, + p5 + p4 + p3 + p2 + p1 + p0*2 + q0*2 + q1*2 + q2 + q3 + q4 + q5 + q6 + (1<<3) >> 4, + p4 + p3 + p2 + p1 + p0 + q0*2 + q1*2 + q2*2 + q3 + q4 + q5 + q6*2 + (1<<3) >> 4, + p3 + p2 + p1 + p0 + q0 + q1*2 + q2*2 + q3*2 + q4 + q5 + q6*3 + (1<<3) >> 4, + p2 + p1 + p0 + q0 + q1 + q2*2 + q3*2 + q4*2 + q5 + q6*4 + (1<<3) >> 4, + p1 + p0 + q0 + q1 + q2 + q3*2 + q4*2 + q5*2 + q6*5 + (1<<3) >> 4, + p0 + q0 + q1 + q2 + q3 + q4*2 + q5*2 + q6*7 + (1<<3) >> 4 + ] +} fn stride_copy(dst: &mut [u16], src: &[i32], pitch: usize) { - for (dst, src) in dst.iter_mut().step_by(pitch).take(src.len()).zip(src) { *dst = *src as u16 }; + for (dst, src) in dst.iter_mut().step_by(pitch).take(src.len()).zip(src) { + *dst = *src as u16 + } } fn stride_sse(a: &[u16], b: &[i32], pitch: usize) -> i64 { - let mut acc: i32 = 0; - for (a, b) in a.iter().step_by(pitch).take(b.len()).zip(b) { - acc += (*a as i32 - *b) * (*a as i32 - *b) - }; - acc as i64 + let mut acc: i32 = 0; + for (a, b) in a.iter().step_by(pitch).take(b.len()).zip(b) { + acc += (*a as i32 - *b) * (*a as i32 - *b) + } + acc as i64 } fn _level_to_limit(level: i32, shift: usize) -> i32 { - level << shift + level << shift } fn limit_to_level(limit: i32, shift: usize) -> i32 { - limit + (1 << shift) - 1 >> shift + limit + (1 << shift) - 1 >> shift } fn _level_to_blimit(level: i32, shift: usize) -> i32 { - 3 * level + 4 << shift + 3 * level + 4 << shift } fn blimit_to_level(blimit: i32, shift: usize) -> i32 { - ((blimit + (1 << shift) - 1 >> shift) - 2) / 3 + ((blimit + (1 << shift) - 1 >> shift) - 2) / 3 } fn _level_to_thresh(level: i32, shift: usize) -> i32 { - level >> 4 << shift + level >> 4 << shift } fn thresh_to_level(thresh: i32, shift: usize) -> i32 { - thresh + (1 << shift) - 1 >> shift << 4 + thresh + (1 << shift) - 1 >> shift << 4 } fn nhev4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> usize { - thresh_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift) as usize + thresh_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift) as usize } fn mask4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> usize { - cmp::max(limit_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift), - blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift)) as usize + cmp::max( + limit_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift), + blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift) + ) as usize } // Assumes rec[0] is set 2 taps back from the edge -fn deblock_size4(rec: &mut[u16], pitch: usize, stride: usize, level: usize, bd: usize) { - let mut s = 0; - for _i in 0..4 { - let p = &mut rec[s..]; - let p1 = p[0] as i32; - let p0 = p[pitch] as i32; - let q0 = p[pitch*2] as i32; - let q1 = p[pitch*3] as i32; - if mask4(p1, p0, q0, q1, bd - 8) <= level { - let x; - if nhev4(p1, p0, q0, q1, bd - 8) <= level { - x = filter_narrow4_4(p1, p0, q0, q1, bd - 8); - } else { - x = filter_narrow2_4(p1, p0, q0, q1, bd - 8); - } - stride_copy(p, &x, pitch); - } - s += stride; +fn deblock_size4( + rec: &mut [u16], pitch: usize, stride: usize, level: usize, bd: usize +) { + let mut s = 0; + for _i in 0..4 { + let p = &mut rec[s..]; + let p1 = p[0] as i32; + let p0 = p[pitch] as i32; + let q0 = p[pitch * 2] as i32; + let q1 = p[pitch * 3] as i32; + if mask4(p1, p0, q0, q1, bd - 8) <= level { + let x; + if nhev4(p1, p0, q0, q1, bd - 8) <= level { + x = filter_narrow4_4(p1, p0, q0, q1, bd - 8); + } else { + x = filter_narrow2_4(p1, p0, q0, q1, bd - 8); + } + stride_copy(p, &x, pitch); } + s += stride; + } } // Assumes rec[0] and src[0] are set 2 taps back from the edge. // Accesses four taps, accumulates four pixels into the tally -fn sse_size4(rec: &[u16], src: &[u16], tally: &mut [i64; MAX_LOOP_FILTER+2], - rec_pitch: usize, src_pitch: usize, rec_stride: usize, src_stride: usize, bd: usize) { - let mut rec_s = 0; - let mut src_s = 0; - for _i in 0..4 { - let p = &rec[rec_s..]; // four taps - let a = &src[src_s..]; // four pixels to compare - let p1 = p[0] as i32; - let p0 = p[rec_pitch] as i32; - let q0 = p[rec_pitch*2] as i32; - let q1 = p[rec_pitch*3] as i32; - - // three possibilities: no filter, narrow2 and narrow4 - // All possibilities produce four outputs - let none: [_; 4] = [p1, p0, q0, q1]; - let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8); - let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8); - - // mask4 sets the dividing line for filter vs no filter - // nhev4 sets the dividing line between narrow2 and narrow4 - let mask = clamp(mask4(p1, p0, q0, q1, bd-8), 1, MAX_LOOP_FILTER+1) as usize; - let nhev = clamp(nhev4(p1, p0, q0, q1, bd-8), mask, MAX_LOOP_FILTER+1) as usize; - - // sse for each; short-circuit the 'special' no-op cases. - let sse_none = stride_sse(a, &none, src_pitch); - let sse_narrow2 = if nhev != mask { stride_sse(a, &narrow2, src_pitch) } else { sse_none }; - let sse_narrow4 = if nhev <= MAX_LOOP_FILTER { stride_sse(a, &narrow4, src_pitch) } else { sse_none }; - - // accumulate possible filter values into the tally - // level 0 is a special case - tally[0] += sse_none; - tally[mask] -= sse_none; - tally[mask] += sse_narrow2; - tally[nhev] -= sse_narrow2; - tally[nhev] += sse_narrow4; - - rec_s += rec_stride; - src_s += src_stride; - } +fn sse_size4( + rec: &[u16], src: &[u16], tally: &mut [i64; MAX_LOOP_FILTER + 2], + rec_pitch: usize, src_pitch: usize, rec_stride: usize, src_stride: usize, + bd: usize +) { + let mut rec_s = 0; + let mut src_s = 0; + for _i in 0..4 { + let p = &rec[rec_s..]; // four taps + let a = &src[src_s..]; // four pixels to compare + let p1 = p[0] as i32; + let p0 = p[rec_pitch] as i32; + let q0 = p[rec_pitch * 2] as i32; + let q1 = p[rec_pitch * 3] as i32; + + // three possibilities: no filter, narrow2 and narrow4 + // All possibilities produce four outputs + let none: [_; 4] = [p1, p0, q0, q1]; + let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8); + let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8); + + // mask4 sets the dividing line for filter vs no filter + // nhev4 sets the dividing line between narrow2 and narrow4 + let mask = + clamp(mask4(p1, p0, q0, q1, bd - 8), 1, MAX_LOOP_FILTER + 1) as usize; + let nhev = + clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1) as usize; + + // sse for each; short-circuit the 'special' no-op cases. + let sse_none = stride_sse(a, &none, src_pitch); + let sse_narrow2 = + if nhev != mask { stride_sse(a, &narrow2, src_pitch) } else { sse_none }; + let sse_narrow4 = if nhev <= MAX_LOOP_FILTER { + stride_sse(a, &narrow4, src_pitch) + } else { + sse_none + }; + + // accumulate possible filter values into the tally + // level 0 is a special case + tally[0] += sse_none; + tally[mask] -= sse_none; + tally[mask] += sse_narrow2; + tally[nhev] -= sse_narrow2; + tally[nhev] += sse_narrow4; + + rec_s += rec_stride; + src_s += src_stride; + } } -fn mask6(p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize) -> usize { - cmp::max( limit_to_level(cmp::max((p2 - p1).abs(), - cmp::max((p1 - p0).abs(), - cmp::max((q2 - q1).abs(), - (q1 - q0).abs()))), shift), - blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift)) as usize +fn mask6( + p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize +) -> usize { + cmp::max( + limit_to_level( + cmp::max( + (p2 - p1).abs(), + cmp::max((p1 - p0).abs(), cmp::max((q2 - q1).abs(), (q1 - q0).abs())) + ), + shift + ), + blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift) + ) as usize } fn flat6(p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32) -> usize { - cmp::max((p1 - p0).abs(), - cmp::max((q1 - q0).abs(), - cmp::max((p2 - p0).abs(),(q2 - q0).abs()))) as usize + cmp::max( + (p1 - p0).abs(), + cmp::max((q1 - q0).abs(), cmp::max((p2 - p0).abs(), (q2 - q0).abs())) + ) as usize } // Assumes slice[0] is set 3 taps back from the edge -fn deblock_size6(rec: &mut[u16], pitch: usize, stride: usize, level: usize, bd: usize) { - let mut s = 0; - let flat = 1 << bd - 8; - for _i in 0..4 { - let p = &mut rec[s..]; - let p2 = p[0] as i32; - let p1 = p[pitch] as i32; - let p0 = p[pitch*2] as i32; - let q0 = p[pitch*3] as i32; - let q1 = p[pitch*4] as i32; - let q2 = p[pitch*5] as i32; - if mask6(p2, p1, p0, q0, q1, q2, bd - 8) <= level { - let x; - if flat6(p2, p1, p0, q0, q1, q2) <= flat { - x = filter_wide6_4(p2, p1, p0, q0, q1, q2); - } else if nhev4(p1, p0, q0, q1, bd - 8) <= level { - x = filter_narrow4_4(p1, p0, q0, q1, bd - 8); - } else { - x = filter_narrow2_4(p1, p0, q0, q1, bd - 8); - } - stride_copy(&mut p[pitch..], &x, pitch); - } - s += stride; +fn deblock_size6( + rec: &mut [u16], pitch: usize, stride: usize, level: usize, bd: usize +) { + let mut s = 0; + let flat = 1 << bd - 8; + for _i in 0..4 { + let p = &mut rec[s..]; + let p2 = p[0] as i32; + let p1 = p[pitch] as i32; + let p0 = p[pitch * 2] as i32; + let q0 = p[pitch * 3] as i32; + let q1 = p[pitch * 4] as i32; + let q2 = p[pitch * 5] as i32; + if mask6(p2, p1, p0, q0, q1, q2, bd - 8) <= level { + let x; + if flat6(p2, p1, p0, q0, q1, q2) <= flat { + x = filter_wide6_4(p2, p1, p0, q0, q1, q2); + } else if nhev4(p1, p0, q0, q1, bd - 8) <= level { + x = filter_narrow4_4(p1, p0, q0, q1, bd - 8); + } else { + x = filter_narrow2_4(p1, p0, q0, q1, bd - 8); + } + stride_copy(&mut p[pitch..], &x, pitch); } + s += stride; + } } // Assumes rec[0] and src[0] are set 3 taps back from the edge. // Accesses six taps, accumulates four pixels into the tally -fn sse_size6(rec: &[u16], src: &[u16], tally: &mut [i64; MAX_LOOP_FILTER+2], - rec_pitch: usize, src_pitch: usize, rec_stride: usize, src_stride: usize, bd: usize){ - let mut rec_s = 0; - let mut src_s = 0; - let flat = 1 << bd - 8; - for _i in 0..4 { - let p = &rec[rec_s..]; // six taps - let a = &src[src_s + src_pitch..]; // four pixels to compare so offset one forward - let p2 = p[0] as i32; - let p1 = p[rec_pitch] as i32; - let p0 = p[rec_pitch*2] as i32; - let q0 = p[rec_pitch*3] as i32; - let q1 = p[rec_pitch*4] as i32; - let q2 = p[rec_pitch*5] as i32; - - // Four possibilities: no filter, wide6, narrow2 and narrow4 - // All possibilities produce four outputs - let none: [_; 4] = [p1, p0, q0, q1]; - let wide6 = filter_wide6_4(p2, p1, p0, q0, q1, q2); - let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8); - let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8); - - // mask6 sets the dividing line for filter vs no filter - // flat6 decides between wide and narrow filters (unrelated to level) - // nhev4 sets the dividing line between narrow2 and narrow4 - let mask = clamp(mask6(p2, p1, p0, q0, q1, q2, bd-8), 1, MAX_LOOP_FILTER+1) as usize; - let flatp = flat6(p2, p1, p0, q0, q1, q2) <= flat; - let nhev = clamp(nhev4(p1, p0, q0, q1, bd-8), mask, MAX_LOOP_FILTER+1) as usize; - - // sse for each; short-circuit the 'special' no-op cases. - let sse_none = stride_sse(a, &none, src_pitch); - let sse_wide6 = if flatp && mask <= MAX_LOOP_FILTER { stride_sse(a, &wide6, src_pitch) } else { sse_none }; - let sse_narrow2 = if !flatp && nhev != mask { stride_sse(a, &narrow2, src_pitch) } else { sse_none }; - let sse_narrow4 = if !flatp && nhev <= MAX_LOOP_FILTER { stride_sse(a, &narrow4, src_pitch) } else { sse_none }; - - // accumulate possible filter values into the tally - tally[0] += sse_none; - tally[mask] -= sse_none; - if flatp { - tally[mask] += sse_wide6; - } else { - tally[mask] += sse_narrow2; - tally[nhev] -= sse_narrow2; - tally[nhev] += sse_narrow4; - } +fn sse_size6( + rec: &[u16], src: &[u16], tally: &mut [i64; MAX_LOOP_FILTER + 2], + rec_pitch: usize, src_pitch: usize, rec_stride: usize, src_stride: usize, + bd: usize +) { + let mut rec_s = 0; + let mut src_s = 0; + let flat = 1 << bd - 8; + for _i in 0..4 { + let p = &rec[rec_s..]; // six taps + let a = &src[src_s + src_pitch..]; // four pixels to compare so offset one forward + let p2 = p[0] as i32; + let p1 = p[rec_pitch] as i32; + let p0 = p[rec_pitch * 2] as i32; + let q0 = p[rec_pitch * 3] as i32; + let q1 = p[rec_pitch * 4] as i32; + let q2 = p[rec_pitch * 5] as i32; + + // Four possibilities: no filter, wide6, narrow2 and narrow4 + // All possibilities produce four outputs + let none: [_; 4] = [p1, p0, q0, q1]; + let wide6 = filter_wide6_4(p2, p1, p0, q0, q1, q2); + let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8); + let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8); + + // mask6 sets the dividing line for filter vs no filter + // flat6 decides between wide and narrow filters (unrelated to level) + // nhev4 sets the dividing line between narrow2 and narrow4 + let mask = + clamp(mask6(p2, p1, p0, q0, q1, q2, bd - 8), 1, MAX_LOOP_FILTER + 1) + as usize; + let flatp = flat6(p2, p1, p0, q0, q1, q2) <= flat; + let nhev = + clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1) as usize; + + // sse for each; short-circuit the 'special' no-op cases. + let sse_none = stride_sse(a, &none, src_pitch); + let sse_wide6 = if flatp && mask <= MAX_LOOP_FILTER { + stride_sse(a, &wide6, src_pitch) + } else { + sse_none + }; + let sse_narrow2 = if !flatp && nhev != mask { + stride_sse(a, &narrow2, src_pitch) + } else { + sse_none + }; + let sse_narrow4 = if !flatp && nhev <= MAX_LOOP_FILTER { + stride_sse(a, &narrow4, src_pitch) + } else { + sse_none + }; - rec_s += rec_stride; - src_s += src_stride; + // accumulate possible filter values into the tally + tally[0] += sse_none; + tally[mask] -= sse_none; + if flatp { + tally[mask] += sse_wide6; + } else { + tally[mask] += sse_narrow2; + tally[nhev] -= sse_narrow2; + tally[nhev] += sse_narrow4; } + + rec_s += rec_stride; + src_s += src_stride; + } } -fn mask8(p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32, shift: usize) -> usize { - cmp::max(limit_to_level(cmp::max((p3 - p2).abs(), - cmp::max((p2 - p1).abs(), - cmp::max((p1 - p0).abs(), - cmp::max((q3 - q2).abs(), - cmp::max((q2 - q1).abs(), - (q1 - q0).abs()))))), shift), - blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift)) as usize +fn mask8( + p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32, + shift: usize +) -> usize { + cmp::max( + limit_to_level( + cmp::max( + (p3 - p2).abs(), + cmp::max( + (p2 - p1).abs(), + cmp::max( + (p1 - p0).abs(), + cmp::max( + (q3 - q2).abs(), + cmp::max((q2 - q1).abs(), (q1 - q0).abs()) + ) + ) + ) + ), + shift + ), + blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift) + ) as usize } -fn flat8(p3:i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32) -> usize { - cmp::max((p1 - p0).abs(), - cmp::max((q1 - q0).abs(), - cmp::max((p2 - p0).abs(), - cmp::max((q2 - q0).abs(), - cmp::max((p3 - p0).abs(), (q3 - q0).abs()))))) as usize +fn flat8( + p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32 +) -> usize { + cmp::max( + (p1 - p0).abs(), + cmp::max( + (q1 - q0).abs(), + cmp::max( + (p2 - p0).abs(), + cmp::max((q2 - q0).abs(), cmp::max((p3 - p0).abs(), (q3 - q0).abs())) + ) + ) + ) as usize } // Assumes rec[0] is set 4 taps back from the edge -fn deblock_size8(rec: &mut[u16], pitch: usize, stride: usize, level: usize, bd: usize) { - let mut s = 0; - let flat = 1 << bd - 8; - for _i in 0..4 { - let p = &mut rec[s..]; - let p3 = p[0] as i32; - let p2 = p[pitch] as i32; - let p1 = p[pitch*2] as i32; - let p0 = p[pitch*3] as i32; - let q0 = p[pitch*4] as i32; - let q1 = p[pitch*5] as i32; - let q2 = p[pitch*6] as i32; - let q3 = p[pitch*7] as i32; - if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level { - let x: [i32; 6]; - if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat { - x = filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3); - } else { - if nhev4(p1, p0, q0, q1, bd - 8) <= level { - x = filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8); - } else { - x = filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8); - } - } - stride_copy(&mut p[pitch..], &x, pitch); - } - s += stride; +fn deblock_size8( + rec: &mut [u16], pitch: usize, stride: usize, level: usize, bd: usize +) { + let mut s = 0; + let flat = 1 << bd - 8; + for _i in 0..4 { + let p = &mut rec[s..]; + let p3 = p[0] as i32; + let p2 = p[pitch] as i32; + let p1 = p[pitch * 2] as i32; + let p0 = p[pitch * 3] as i32; + let q0 = p[pitch * 4] as i32; + let q1 = p[pitch * 5] as i32; + let q2 = p[pitch * 6] as i32; + let q3 = p[pitch * 7] as i32; + if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level { + let x: [i32; 6]; + if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat { + x = filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3); + } else { + if nhev4(p1, p0, q0, q1, bd - 8) <= level { + x = filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8); + } else { + x = filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8); + } + } + stride_copy(&mut p[pitch..], &x, pitch); } + s += stride; + } } // Assumes rec[0] and src[0] are set 4 taps back from the edge. // Accesses eight taps, accumulates six pixels into the tally -fn sse_size8(rec: &[u16], src: &[u16], tally: &mut [i64; MAX_LOOP_FILTER+2], - rec_pitch: usize, src_pitch: usize, rec_stride: usize, src_stride: usize, bd: usize) { - let mut rec_s = 0; - let mut src_s = 0; - let flat = 1 << bd - 8; - for _i in 0..4 { - let p = &rec[rec_s..]; // eight taps - let a = &src[src_s + src_pitch..]; // six pixels to compare so offset one forward - let p3 = p[0] as i32; - let p2 = p[rec_pitch] as i32; - let p1 = p[rec_pitch*2] as i32; - let p0 = p[rec_pitch*3] as i32; - let q0 = p[rec_pitch*4] as i32; - let q1 = p[rec_pitch*5] as i32; - let q2 = p[rec_pitch*6] as i32; - let q3 = p[rec_pitch*7] as i32; - - // Four possibilities: no filter, wide8, narrow2 and narrow4 - let none: [_; 6] = [p2, p1, p0, q0, q1, q2]; - let wide8: [_; 6] = filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3); - let narrow2: [_; 6] = filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8); - let narrow4: [_; 6] = filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8); - - // mask8 sets the dividing line for filter vs no filter - // flat8 decides between wide and narrow filters (unrelated to level) - // nhev4 sets the dividing line between narrow2 and narrow4 - let mask = clamp(mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd-8), 1, MAX_LOOP_FILTER+1) as usize; - let flatp = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat; - let nhev = clamp(nhev4(p1, p0, q0, q1, bd-8), mask, MAX_LOOP_FILTER+1) as usize; - - // sse for each; short-circuit the 'special' no-op cases. - let sse_none = stride_sse(a, &none, src_pitch); - let sse_wide8 = if flatp && mask <= MAX_LOOP_FILTER { stride_sse(a, &wide8, src_pitch) } else { sse_none }; - let sse_narrow2 = if !flatp && nhev != mask { stride_sse(a, &narrow2, src_pitch) } else { sse_none }; - let sse_narrow4 = if !flatp && nhev <= MAX_LOOP_FILTER { stride_sse(a, &narrow4, src_pitch) } else { sse_none }; - - // accumulate possible filter values into the tally - tally[0] += sse_none; - tally[mask] -= sse_none; - if flatp { - tally[mask] += sse_wide8; - } else { - tally[mask] += sse_narrow2; - tally[nhev] -= sse_narrow2; - tally[nhev] += sse_narrow4; - } +fn sse_size8( + rec: &[u16], src: &[u16], tally: &mut [i64; MAX_LOOP_FILTER + 2], + rec_pitch: usize, src_pitch: usize, rec_stride: usize, src_stride: usize, + bd: usize +) { + let mut rec_s = 0; + let mut src_s = 0; + let flat = 1 << bd - 8; + for _i in 0..4 { + let p = &rec[rec_s..]; // eight taps + let a = &src[src_s + src_pitch..]; // six pixels to compare so offset one forward + let p3 = p[0] as i32; + let p2 = p[rec_pitch] as i32; + let p1 = p[rec_pitch * 2] as i32; + let p0 = p[rec_pitch * 3] as i32; + let q0 = p[rec_pitch * 4] as i32; + let q1 = p[rec_pitch * 5] as i32; + let q2 = p[rec_pitch * 6] as i32; + let q3 = p[rec_pitch * 7] as i32; + + // Four possibilities: no filter, wide8, narrow2 and narrow4 + let none: [_; 6] = [p2, p1, p0, q0, q1, q2]; + let wide8: [_; 6] = filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3); + let narrow2: [_; 6] = filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8); + let narrow4: [_; 6] = filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8); + + // mask8 sets the dividing line for filter vs no filter + // flat8 decides between wide and narrow filters (unrelated to level) + // nhev4 sets the dividing line between narrow2 and narrow4 + let mask = clamp( + mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8), + 1, + MAX_LOOP_FILTER + 1 + ) as usize; + let flatp = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat; + let nhev = + clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1) as usize; + + // sse for each; short-circuit the 'special' no-op cases. + let sse_none = stride_sse(a, &none, src_pitch); + let sse_wide8 = if flatp && mask <= MAX_LOOP_FILTER { + stride_sse(a, &wide8, src_pitch) + } else { + sse_none + }; + let sse_narrow2 = if !flatp && nhev != mask { + stride_sse(a, &narrow2, src_pitch) + } else { + sse_none + }; + let sse_narrow4 = if !flatp && nhev <= MAX_LOOP_FILTER { + stride_sse(a, &narrow4, src_pitch) + } else { + sse_none + }; - src_s += src_stride; - rec_s += rec_stride; + // accumulate possible filter values into the tally + tally[0] += sse_none; + tally[mask] -= sse_none; + if flatp { + tally[mask] += sse_wide8; + } else { + tally[mask] += sse_narrow2; + tally[nhev] -= sse_narrow2; + tally[nhev] += sse_narrow4; } + + src_s += src_stride; + rec_s += rec_stride; + } } -fn flat14_outer(p6: i32, p5: i32, p4: i32, p0: i32, q0: i32, q4: i32, q5: i32, q6: i32) -> usize { - cmp::max((p4 - p0).abs(), - cmp::max((q4 - q0).abs(), - cmp::max((p5 - p0).abs(), - cmp::max((q5 - q0).abs(), - cmp::max((p6 - p0).abs(),(q6 - q0).abs()))))) as usize +fn flat14_outer( + p6: i32, p5: i32, p4: i32, p0: i32, q0: i32, q4: i32, q5: i32, q6: i32 +) -> usize { + cmp::max( + (p4 - p0).abs(), + cmp::max( + (q4 - q0).abs(), + cmp::max( + (p5 - p0).abs(), + cmp::max((q5 - q0).abs(), cmp::max((p6 - p0).abs(), (q6 - q0).abs())) + ) + ) + ) as usize } // Assumes rec[0] is set 7 taps back from the edge -fn deblock_size14(rec: &mut[u16], pitch: usize, stride: usize, level: usize, bd: usize) { - let mut s = 0; - let flat = 1 << bd - 8; - for _i in 0..4 { - let p = &mut rec[s..]; - let p6 = p[0] as i32; - let p5 = p[pitch] as i32; - let p4 = p[pitch*2] as i32; - let p3 = p[pitch*3] as i32; - let p2 = p[pitch*4] as i32; - let p1 = p[pitch*5] as i32; - let p0 = p[pitch*6] as i32; - let q0 = p[pitch*7] as i32; - let q1 = p[pitch*8] as i32; - let q2 = p[pitch*9] as i32; - let q3 = p[pitch*10] as i32; - let q4 = p[pitch*11] as i32; - let q5 = p[pitch*12] as i32; - let q6 = p[pitch*13] as i32; - // 'mask' test - if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level { - let x: [i32; 12]; - // inner flatness test - if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat { - // outer flatness test - if flat14_outer(p6, p5, p4, p0, q0, q4, q5, q6) <= flat { - // sufficient flatness across 14 pixel width; run full-width filter - x = filter_wide14_12(p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6); - } else { - // only flat in inner area, run 8-tap - x = filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5); - } - } else { - // not flat, run narrow filter - if nhev4(p1, p0, q0, q1, bd - 8) <= level { - x = filter_narrow4_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8); - } else { - x = filter_narrow2_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8); - } - } - stride_copy(&mut p[pitch..], &x, pitch); +fn deblock_size14( + rec: &mut [u16], pitch: usize, stride: usize, level: usize, bd: usize +) { + let mut s = 0; + let flat = 1 << bd - 8; + for _i in 0..4 { + let p = &mut rec[s..]; + let p6 = p[0] as i32; + let p5 = p[pitch] as i32; + let p4 = p[pitch * 2] as i32; + let p3 = p[pitch * 3] as i32; + let p2 = p[pitch * 4] as i32; + let p1 = p[pitch * 5] as i32; + let p0 = p[pitch * 6] as i32; + let q0 = p[pitch * 7] as i32; + let q1 = p[pitch * 8] as i32; + let q2 = p[pitch * 9] as i32; + let q3 = p[pitch * 10] as i32; + let q4 = p[pitch * 11] as i32; + let q5 = p[pitch * 12] as i32; + let q6 = p[pitch * 13] as i32; + // 'mask' test + if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level { + let x: [i32; 12]; + // inner flatness test + if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat { + // outer flatness test + if flat14_outer(p6, p5, p4, p0, q0, q4, q5, q6) <= flat { + // sufficient flatness across 14 pixel width; run full-width filter + x = filter_wide14_12( + p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, + ); + } else { + // only flat in inner area, run 8-tap + x = filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5); } - s += stride; + } else { + // not flat, run narrow filter + if nhev4(p1, p0, q0, q1, bd - 8) <= level { + x = filter_narrow4_12( + p5, + p4, + p3, + p2, + p1, + p0, + q0, + q1, + q2, + q3, + q4, + q5, + bd - 8 + ); + } else { + x = filter_narrow2_12( + p5, + p4, + p3, + p2, + p1, + p0, + q0, + q1, + q2, + q3, + q4, + q5, + bd - 8 + ); + } + } + stride_copy(&mut p[pitch..], &x, pitch); } + s += stride; + } } // Assumes rec[0] and src[0] are set 7 taps back from the edge. // Accesses fourteen taps, accumulates twelve pixels into the tally -fn sse_size14(rec: &[u16], src: &[u16], tally: &mut [i64; MAX_LOOP_FILTER+2], - rec_pitch: usize, src_pitch: usize, rec_stride: usize, src_stride: usize, bd: usize) { - let mut rec_s = 0; - let mut src_s = 0; - let flat = 1 << bd - 8; - for _i in 0..4 { - let p = &rec[rec_s..]; // 14 taps - let a = &src[src_s + src_pitch..]; // 12 pixels to compare so offset one forward - let p6 = p[0] as i32; - let p5 = p[rec_pitch] as i32; - let p4 = p[rec_pitch*2] as i32; - let p3 = p[rec_pitch*3] as i32; - let p2 = p[rec_pitch*4] as i32; - let p1 = p[rec_pitch*5] as i32; - let p0 = p[rec_pitch*6] as i32; - let q0 = p[rec_pitch*7] as i32; - let q1 = p[rec_pitch*8] as i32; - let q2 = p[rec_pitch*9] as i32; - let q3 = p[rec_pitch*10] as i32; - let q4 = p[rec_pitch*11] as i32; - let q5 = p[rec_pitch*12] as i32; - let q6 = p[rec_pitch*13] as i32; - - // Five possibilities: no filter, wide14, wide8, narrow2 and narrow4 - let none: [i32 ;12] = [p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5]; - let wide14 = filter_wide14_12(p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6); - let wide8 = filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5); - let narrow2 = filter_narrow2_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8); - let narrow4 = filter_narrow4_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8); - - // mask8 sets the dividing line for filter vs no filter - // flat8 decides between wide and narrow filters (unrelated to level) - // flat14 decides between wide14 and wide8 filters - // nhev4 sets the dividing line between narrow2 and narrow4 - let mask = clamp(mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd-8), 1, MAX_LOOP_FILTER+1) as usize; - let flat8p = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat; - let flat14p = flat14_outer(p6, p5, p4, p0, q0, q4, q5, q6) <= flat; - let nhev = clamp(nhev4(p1, p0, q0, q1, bd-8), mask, MAX_LOOP_FILTER+1) as usize; - - // sse for each; short-circuit the 'special' no-op cases. - let sse_none = stride_sse(a, &none, src_pitch); - let sse_wide8 = if flat8p && !flat14p && mask <= MAX_LOOP_FILTER { - stride_sse(a, &wide8, src_pitch) - } else { - sse_none - }; - let sse_wide14 = if flat8p && flat14p && mask <= MAX_LOOP_FILTER { - stride_sse(a, &wide14, src_pitch) - } else { - sse_none - }; - let sse_narrow2 = if !flat8p && nhev != mask { - stride_sse(a, &narrow2, src_pitch) - } else { - sse_none - }; - let sse_narrow4 = if !flat8p && nhev <= MAX_LOOP_FILTER { - stride_sse(a, &narrow4, src_pitch) - } else { - sse_none - }; - - // accumulate possible filter values into the tally - tally[0] += sse_none; - tally[mask] -= sse_none; - if flat8p { - if flat14p { - tally[mask] += sse_wide14; - } else { - tally[mask] += sse_wide8; - } - } else { - tally[mask] += sse_narrow2; - tally[nhev] -= sse_narrow2; - tally[nhev] += sse_narrow4; - } +fn sse_size14( + rec: &[u16], src: &[u16], tally: &mut [i64; MAX_LOOP_FILTER + 2], + rec_pitch: usize, src_pitch: usize, rec_stride: usize, src_stride: usize, + bd: usize +) { + let mut rec_s = 0; + let mut src_s = 0; + let flat = 1 << bd - 8; + for _i in 0..4 { + let p = &rec[rec_s..]; // 14 taps + let a = &src[src_s + src_pitch..]; // 12 pixels to compare so offset one forward + let p6 = p[0] as i32; + let p5 = p[rec_pitch] as i32; + let p4 = p[rec_pitch * 2] as i32; + let p3 = p[rec_pitch * 3] as i32; + let p2 = p[rec_pitch * 4] as i32; + let p1 = p[rec_pitch * 5] as i32; + let p0 = p[rec_pitch * 6] as i32; + let q0 = p[rec_pitch * 7] as i32; + let q1 = p[rec_pitch * 8] as i32; + let q2 = p[rec_pitch * 9] as i32; + let q3 = p[rec_pitch * 10] as i32; + let q4 = p[rec_pitch * 11] as i32; + let q5 = p[rec_pitch * 12] as i32; + let q6 = p[rec_pitch * 13] as i32; + + // Five possibilities: no filter, wide14, wide8, narrow2 and narrow4 + let none: [i32; 12] = [p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5]; + let wide14 = + filter_wide14_12(p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6); + let wide8 = + filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5); + let narrow2 = filter_narrow2_12( + p5, + p4, + p3, + p2, + p1, + p0, + q0, + q1, + q2, + q3, + q4, + q5, + bd - 8 + ); + let narrow4 = filter_narrow4_12( + p5, + p4, + p3, + p2, + p1, + p0, + q0, + q1, + q2, + q3, + q4, + q5, + bd - 8 + ); + + // mask8 sets the dividing line for filter vs no filter + // flat8 decides between wide and narrow filters (unrelated to level) + // flat14 decides between wide14 and wide8 filters + // nhev4 sets the dividing line between narrow2 and narrow4 + let mask = clamp( + mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8), + 1, + MAX_LOOP_FILTER + 1 + ) as usize; + let flat8p = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat; + let flat14p = flat14_outer(p6, p5, p4, p0, q0, q4, q5, q6) <= flat; + let nhev = + clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1) as usize; + + // sse for each; short-circuit the 'special' no-op cases. + let sse_none = stride_sse(a, &none, src_pitch); + let sse_wide8 = if flat8p && !flat14p && mask <= MAX_LOOP_FILTER { + stride_sse(a, &wide8, src_pitch) + } else { + sse_none + }; + let sse_wide14 = if flat8p && flat14p && mask <= MAX_LOOP_FILTER { + stride_sse(a, &wide14, src_pitch) + } else { + sse_none + }; + let sse_narrow2 = if !flat8p && nhev != mask { + stride_sse(a, &narrow2, src_pitch) + } else { + sse_none + }; + let sse_narrow4 = if !flat8p && nhev <= MAX_LOOP_FILTER { + stride_sse(a, &narrow4, src_pitch) + } else { + sse_none + }; - rec_s += rec_stride; - src_s += src_stride; + // accumulate possible filter values into the tally + tally[0] += sse_none; + tally[mask] -= sse_none; + if flat8p { + if flat14p { + tally[mask] += sse_wide14; + } else { + tally[mask] += sse_wide8; + } + } else { + tally[mask] += sse_narrow2; + tally[nhev] -= sse_narrow2; + tally[nhev] += sse_narrow4; } -} -fn filter_v_edge(deblock: &DeblockState, - bc: &BlockContext, - bo: &BlockOffset, - p: &mut Plane, - pli: usize, - bd: usize) { - let block = bc.at(&bo); - let tx_edge = bo.x & (block.tx_w - 1) == 0; - if tx_edge { - let prev_block = deblock_left(bc, bo, p); - let block_edge = bo.x & (block.n4_w - 1) == 0; - let filter_size = deblock_size(block, prev_block, p, pli, true, block_edge); - if filter_size > 0 { - let level = deblock_level(deblock, block, prev_block, pli, true); - if level > 0 { - let po = bo.plane_offset(&p.cfg); - let stride = p.cfg.stride; - let mut plane_slice = p.mut_slice(&po); - plane_slice.x -= (filter_size>>1) as isize; - let slice = plane_slice.as_mut_slice(); - match filter_size { - 4 => { deblock_size4(slice, 1, stride, level, bd); }, - 6 => { deblock_size6(slice, 1, stride, level, bd); }, - 8 => { deblock_size8(slice, 1, stride, level, bd); }, - 14 => { deblock_size14(slice, 1, stride, level, bd); }, - _ => {unreachable!()} - } - } - } - } + rec_s += rec_stride; + src_s += src_stride; + } } -fn sse_v_edge(bc: &BlockContext, - bo: &BlockOffset, - rec_plane: &Plane, - src_plane: &Plane, - tally: &mut [i64; MAX_LOOP_FILTER+2], - pli: usize, - bd: usize) { - let block = bc.at(&bo); - let tx_edge = bo.x & (block.tx_w - 1) == 0; - if tx_edge { - let prev_block = deblock_left(bc, bo, rec_plane); - let block_edge = bo.x & (block.n4_w - 1) == 0; - let filter_size = deblock_size(block, prev_block, rec_plane, pli, true, block_edge); - if filter_size > 0 { - let po = bo.plane_offset(&rec_plane.cfg); // rec and src have identical subsampling - let rec_slice = rec_plane.slice(&po); - let src_slice = src_plane.slice(&po); - let rec_tmp = rec_slice.go_left(filter_size>>1); - let src_tmp = src_slice.go_left(filter_size>>1); - let rec = rec_tmp.as_slice(); - let src = src_tmp.as_slice(); - match filter_size { - 4 => { sse_size4(rec, src, tally, 1, 1, rec_plane.cfg.stride, src_plane.cfg.stride, bd); }, - 6 => { sse_size6(rec, src, tally, 1, 1, rec_plane.cfg.stride, src_plane.cfg.stride, bd); }, - 8 => { sse_size8(rec, src, tally, 1, 1, rec_plane.cfg.stride, src_plane.cfg.stride, bd); }, - 14 => { sse_size14(rec, src, tally, 1, 1, rec_plane.cfg.stride, src_plane.cfg.stride, bd); }, - _ => {unreachable!()} - } +fn filter_v_edge( + deblock: &DeblockState, bc: &BlockContext, bo: &BlockOffset, p: &mut Plane, + pli: usize, bd: usize +) { + let block = bc.at(&bo); + let tx_edge = bo.x & (block.tx_w - 1) == 0; + if tx_edge { + let prev_block = deblock_left(bc, bo, p); + let block_edge = bo.x & (block.n4_w - 1) == 0; + let filter_size = + deblock_size(block, prev_block, p, pli, true, block_edge); + if filter_size > 0 { + let level = deblock_level(deblock, block, prev_block, pli, true); + if level > 0 { + let po = bo.plane_offset(&p.cfg); + let stride = p.cfg.stride; + let mut plane_slice = p.mut_slice(&po); + plane_slice.x -= (filter_size >> 1) as isize; + let slice = plane_slice.as_mut_slice(); + match filter_size { + 4 => { + deblock_size4(slice, 1, stride, level, bd); + } + 6 => { + deblock_size6(slice, 1, stride, level, bd); + } + 8 => { + deblock_size8(slice, 1, stride, level, bd); + } + 14 => { + deblock_size14(slice, 1, stride, level, bd); + } + _ => unreachable!() } + } } + } } -fn filter_h_edge(deblock: &DeblockState, - bc: &BlockContext, - bo: &BlockOffset, - p: &mut Plane, - pli: usize, - bd: usize) { - let block = bc.at(&bo); - let tx_edge = bo.y & (block.tx_h - 1) == 0; - if tx_edge { - let prev_block = deblock_up(bc, bo, p); - let block_edge = bo.y & (block.n4_h - 1) == 0; - let filter_size = deblock_size(block, prev_block, p, pli, false, block_edge); - if filter_size > 0 { - let level = deblock_level(deblock, block, prev_block, pli, false); - if level > 0 { - let po = bo.plane_offset(&p.cfg); - let stride = p.cfg.stride; - let mut plane_slice = p.mut_slice(&po); - plane_slice.y -= (filter_size>>1) as isize; - let slice = plane_slice.as_mut_slice(); - match filter_size { - 4 => { deblock_size4(slice, stride, 1, level, bd); }, - 6 => { deblock_size6(slice, stride, 1, level, bd); }, - 8 => { deblock_size8(slice, stride, 1, level, bd); }, - 14 => { deblock_size14(slice, stride, 1, level, bd); }, - _ => {unreachable!()} - } - } +fn sse_v_edge( + bc: &BlockContext, bo: &BlockOffset, rec_plane: &Plane, src_plane: &Plane, + tally: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, bd: usize +) { + let block = bc.at(&bo); + let tx_edge = bo.x & (block.tx_w - 1) == 0; + if tx_edge { + let prev_block = deblock_left(bc, bo, rec_plane); + let block_edge = bo.x & (block.n4_w - 1) == 0; + let filter_size = + deblock_size(block, prev_block, rec_plane, pli, true, block_edge); + if filter_size > 0 { + let po = bo.plane_offset(&rec_plane.cfg); // rec and src have identical subsampling + let rec_slice = rec_plane.slice(&po); + let src_slice = src_plane.slice(&po); + let rec_tmp = rec_slice.go_left(filter_size >> 1); + let src_tmp = src_slice.go_left(filter_size >> 1); + let rec = rec_tmp.as_slice(); + let src = src_tmp.as_slice(); + match filter_size { + 4 => { + sse_size4( + rec, + src, + tally, + 1, + 1, + rec_plane.cfg.stride, + src_plane.cfg.stride, + bd + ); + } + 6 => { + sse_size6( + rec, + src, + tally, + 1, + 1, + rec_plane.cfg.stride, + src_plane.cfg.stride, + bd + ); + } + 8 => { + sse_size8( + rec, + src, + tally, + 1, + 1, + rec_plane.cfg.stride, + src_plane.cfg.stride, + bd + ); } + 14 => { + sse_size14( + rec, + src, + tally, + 1, + 1, + rec_plane.cfg.stride, + src_plane.cfg.stride, + bd + ); + } + _ => unreachable!() + } } + } } -fn sse_h_edge(bc: &BlockContext, - bo: &BlockOffset, - rec_plane: &Plane, - src_plane: &Plane, - tally: &mut [i64; MAX_LOOP_FILTER+2], - pli: usize, - bd: usize) { - let block = bc.at(&bo); - let tx_edge = bo.y & (block.tx_h - 1) == 0; - if tx_edge { - let prev_block = deblock_up(bc, bo, rec_plane); - let block_edge = bo.y & (block.n4_h - 1) == 0; - let filter_size = deblock_size(block, prev_block, rec_plane, pli, true, block_edge); - if filter_size > 0 { - let po = bo.plane_offset(&rec_plane.cfg); // rec and src have identical subsampling - let rec_slice = rec_plane.slice(&po); - let src_slice = src_plane.slice(&po); - let rec_tmp = rec_slice.go_up(filter_size>>1); - let src_tmp = src_slice.go_up(filter_size>>1); - let rec = rec_tmp.as_slice(); - let src = src_tmp.as_slice(); - match filter_size { - 4 => { sse_size4(rec, src, tally, rec_plane.cfg.stride, src_plane.cfg.stride, 1, 1, bd); }, - 6 => { sse_size6(rec, src, tally, rec_plane.cfg.stride, src_plane.cfg.stride, 1, 1, bd); }, - 8 => { sse_size8(rec, src, tally, rec_plane.cfg.stride, src_plane.cfg.stride, 1, 1, bd); }, - 14 => { sse_size14(rec, src, tally, rec_plane.cfg.stride, src_plane.cfg.stride, 1, 1, bd); }, - _ => {unreachable!()} - } +fn filter_h_edge( + deblock: &DeblockState, bc: &BlockContext, bo: &BlockOffset, p: &mut Plane, + pli: usize, bd: usize +) { + let block = bc.at(&bo); + let tx_edge = bo.y & (block.tx_h - 1) == 0; + if tx_edge { + let prev_block = deblock_up(bc, bo, p); + let block_edge = bo.y & (block.n4_h - 1) == 0; + let filter_size = + deblock_size(block, prev_block, p, pli, false, block_edge); + if filter_size > 0 { + let level = deblock_level(deblock, block, prev_block, pli, false); + if level > 0 { + let po = bo.plane_offset(&p.cfg); + let stride = p.cfg.stride; + let mut plane_slice = p.mut_slice(&po); + plane_slice.y -= (filter_size >> 1) as isize; + let slice = plane_slice.as_mut_slice(); + match filter_size { + 4 => { + deblock_size4(slice, stride, 1, level, bd); + } + 6 => { + deblock_size6(slice, stride, 1, level, bd); + } + 8 => { + deblock_size8(slice, stride, 1, level, bd); + } + 14 => { + deblock_size14(slice, stride, 1, level, bd); + } + _ => unreachable!() } + } } + } } -// Deblocks all edges, vertical and horizontal, in a single plane -pub fn deblock_plane(deblock: &DeblockState, p: &mut Plane, - pli: usize, bc: &mut BlockContext, bd: usize) { - - let xdec = p.cfg.xdec; - let ydec = p.cfg.ydec; - - match pli { - 0 => if deblock.levels[0] == 0 && deblock.levels[1] == 0 {return}, - 1 => if deblock.levels[2] == 0 {return}, - 2 => if deblock.levels[3] == 0 {return}, - _ => {return} - } - - // vertical edge filtering leads horizonal by one full MI-sized - // row (and horizontal filtering doesn't happen along the upper - // edge). Unroll to avoid corner-cases. - if bc.rows > 0 { - for x in (1<<xdec..bc.cols).step_by(1 << xdec) { - filter_v_edge(deblock, bc, &BlockOffset{x: x, y: 0}, p, pli, bd); +fn sse_h_edge( + bc: &BlockContext, bo: &BlockOffset, rec_plane: &Plane, src_plane: &Plane, + tally: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, bd: usize +) { + let block = bc.at(&bo); + let tx_edge = bo.y & (block.tx_h - 1) == 0; + if tx_edge { + let prev_block = deblock_up(bc, bo, rec_plane); + let block_edge = bo.y & (block.n4_h - 1) == 0; + let filter_size = + deblock_size(block, prev_block, rec_plane, pli, true, block_edge); + if filter_size > 0 { + let po = bo.plane_offset(&rec_plane.cfg); // rec and src have identical subsampling + let rec_slice = rec_plane.slice(&po); + let src_slice = src_plane.slice(&po); + let rec_tmp = rec_slice.go_up(filter_size >> 1); + let src_tmp = src_slice.go_up(filter_size >> 1); + let rec = rec_tmp.as_slice(); + let src = src_tmp.as_slice(); + match filter_size { + 4 => { + sse_size4( + rec, + src, + tally, + rec_plane.cfg.stride, + src_plane.cfg.stride, + 1, + 1, + bd + ); } - if bc.rows > 1 << ydec { - for x in (1<<xdec..bc.cols).step_by(1 << xdec) { - filter_v_edge(deblock, bc, &BlockOffset{x: x, y: 1 << ydec}, p, pli, bd); - } + 6 => { + sse_size6( + rec, + src, + tally, + rec_plane.cfg.stride, + src_plane.cfg.stride, + 1, + 1, + bd + ); } - } - - // filter rows where vertical and horizontal edge filtering both - // happen (horizontal edge filtering lags vertical by one row). - for y in ((2 << ydec)..bc.rows).step_by(1 << ydec) { - // Check for vertical edge at first MI block boundary on this row - if 1 << xdec < bc.cols { - filter_v_edge(deblock, bc, &BlockOffset{x: 1 << xdec, y: y}, p, pli, bd); - } - // run the rest of the row with both vertical and horizontal edge filtering. - // Horizontal lags vertical edge by one row and two columns. - for x in (2 << xdec..bc.cols).step_by(1 << xdec){ - filter_v_edge(deblock, bc, &BlockOffset{x: x, y: y}, p, pli, bd); - filter_h_edge(deblock, bc, &BlockOffset{x: x - (2 << xdec), y: y - (1 << ydec)}, p, pli, bd); + 8 => { + sse_size8( + rec, + src, + tally, + rec_plane.cfg.stride, + src_plane.cfg.stride, + 1, + 1, + bd + ); } - // ..and the last two horizontal edges for the row - if bc.cols - (2 << xdec) > 0 { - filter_h_edge(deblock, bc, &BlockOffset{x: bc.cols - (2 << xdec), y: y - (1 << ydec)}, p, pli, bd); - if bc.cols - (1 << xdec) > 0 { - filter_h_edge(deblock, bc, &BlockOffset{x: bc.cols - (1 << xdec), y: y - (1 << ydec)}, p, pli, bd); - } + 14 => { + sse_size14( + rec, + src, + tally, + rec_plane.cfg.stride, + src_plane.cfg.stride, + 1, + 1, + bd + ); } + _ => unreachable!() + } } + } +} - // Last horizontal row, vertical is already complete +// Deblocks all edges, vertical and horizontal, in a single plane +pub fn deblock_plane( + deblock: &DeblockState, p: &mut Plane, pli: usize, bc: &mut BlockContext, + bd: usize +) { + let xdec = p.cfg.xdec; + let ydec = p.cfg.ydec; + + match pli { + 0 => + if deblock.levels[0] == 0 && deblock.levels[1] == 0 { + return; + }, + 1 => + if deblock.levels[2] == 0 { + return; + }, + 2 => + if deblock.levels[3] == 0 { + return; + }, + _ => return + } + + // vertical edge filtering leads horizonal by one full MI-sized + // row (and horizontal filtering doesn't happen along the upper + // edge). Unroll to avoid corner-cases. + if bc.rows > 0 { + for x in (1 << xdec..bc.cols).step_by(1 << xdec) { + filter_v_edge(deblock, bc, &BlockOffset { x, y: 0 }, p, pli, bd); + } if bc.rows > 1 << ydec { - for x in (0..bc.cols).step_by(1 << xdec) { - filter_h_edge(deblock, bc, &BlockOffset{x: x, y: bc.rows - (1 << ydec)}, p, pli, bd); - } + for x in (1 << xdec..bc.cols).step_by(1 << xdec) { + filter_v_edge( + deblock, + bc, + &BlockOffset { x, y: 1 << ydec }, + p, + pli, + bd + ); + } + } + } + + // filter rows where vertical and horizontal edge filtering both + // happen (horizontal edge filtering lags vertical by one row). + for y in ((2 << ydec)..bc.rows).step_by(1 << ydec) { + // Check for vertical edge at first MI block boundary on this row + if 1 << xdec < bc.cols { + filter_v_edge(deblock, bc, &BlockOffset { x: 1 << xdec, y }, p, pli, bd); + } + // run the rest of the row with both vertical and horizontal edge filtering. + // Horizontal lags vertical edge by one row and two columns. + for x in (2 << xdec..bc.cols).step_by(1 << xdec) { + filter_v_edge(deblock, bc, &BlockOffset { x, y }, p, pli, bd); + filter_h_edge( + deblock, + bc, + &BlockOffset { x: x - (2 << xdec), y: y - (1 << ydec) }, + p, + pli, + bd + ); + } + // ..and the last two horizontal edges for the row + if bc.cols - (2 << xdec) > 0 { + filter_h_edge( + deblock, + bc, + &BlockOffset { x: bc.cols - (2 << xdec), y: y - (1 << ydec) }, + p, + pli, + bd + ); + if bc.cols - (1 << xdec) > 0 { + filter_h_edge( + deblock, + bc, + &BlockOffset { x: bc.cols - (1 << xdec), y: y - (1 << ydec) }, + p, + pli, + bd + ); + } + } + } + + // Last horizontal row, vertical is already complete + if bc.rows > 1 << ydec { + for x in (0..bc.cols).step_by(1 << xdec) { + filter_h_edge( + deblock, + bc, + &BlockOffset { x, y: bc.rows - (1 << ydec) }, + p, + pli, + bd + ); } + } } // sse count of all edges in a single plane, accumulates into vertical and horizontal counts -fn sse_plane(rec: &Plane, src: &Plane, - v_sse: &mut [i64; MAX_LOOP_FILTER+2], h_sse: &mut [i64; MAX_LOOP_FILTER+2], - pli: usize, bc: &mut BlockContext, bd: usize) { - - let xdec = rec.cfg.xdec; - let ydec = rec.cfg.ydec; - - // No horizontal edge filtering along top of frame - for x in (1 << xdec..bc.cols).step_by(1 << xdec){ - sse_v_edge(bc, &BlockOffset{x: x, y: 0}, rec, src, v_sse, pli, bd); - } - - // Unlike actual filtering, we're counting horizontal and vertical - // as separable cases. No need to lag the horizontal processing - // behind vertical. - for y in (1 << ydec..bc.rows).step_by(1 << ydec) { - // No vertical filtering along left edge of frame - sse_h_edge(bc, &BlockOffset{x: 0, y: y}, rec, src, h_sse, pli, bd); - for x in (1 << xdec..bc.cols).step_by(1 << xdec){ - sse_v_edge(bc, &BlockOffset{x: x, y: y}, rec, src, v_sse, pli, bd); - sse_h_edge(bc, &BlockOffset{x: x, y: y}, rec, src, h_sse, pli, bd); - } +fn sse_plane( + rec: &Plane, src: &Plane, v_sse: &mut [i64; MAX_LOOP_FILTER + 2], + h_sse: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, bc: &mut BlockContext, + bd: usize +) { + let xdec = rec.cfg.xdec; + let ydec = rec.cfg.ydec; + + // No horizontal edge filtering along top of frame + for x in (1 << xdec..bc.cols).step_by(1 << xdec) { + sse_v_edge(bc, &BlockOffset { x, y: 0 }, rec, src, v_sse, pli, bd); + } + + // Unlike actual filtering, we're counting horizontal and vertical + // as separable cases. No need to lag the horizontal processing + // behind vertical. + for y in (1 << ydec..bc.rows).step_by(1 << ydec) { + // No vertical filtering along left edge of frame + sse_h_edge(bc, &BlockOffset { x: 0, y }, rec, src, h_sse, pli, bd); + for x in (1 << xdec..bc.cols).step_by(1 << xdec) { + sse_v_edge(bc, &BlockOffset { x, y }, rec, src, v_sse, pli, bd); + sse_h_edge(bc, &BlockOffset { x, y }, rec, src, h_sse, pli, bd); } + } } // Deblocks all edges in all planes of a frame -pub fn deblock_filter_frame(fs: &mut FrameState, - bc: &mut BlockContext, bit_depth: usize) { - for pli in 0..PLANES { - deblock_plane(&fs.deblock, &mut fs.rec.planes[pli], pli, bc, bit_depth); - } +pub fn deblock_filter_frame( + fs: &mut FrameState, bc: &mut BlockContext, bit_depth: usize +) { + for pli in 0..PLANES { + deblock_plane(&fs.deblock, &mut fs.rec.planes[pli], pli, bc, bit_depth); + } } fn sse_optimize(fs: &mut FrameState, bc: &mut BlockContext, bit_depth: usize) { - assert!(MAX_LOOP_FILTER < 999); - // i64 allows us to accumulate a total of ~ 35 bits worth of pixels - assert!(fs.input.planes[0].cfg.width.ilog() + fs.input.planes[0].cfg.height.ilog() < 35); - - for pli in 0..PLANES { - let mut v_tally: [i64; MAX_LOOP_FILTER+2] = [0; MAX_LOOP_FILTER+2]; - let mut h_tally: [i64; MAX_LOOP_FILTER+2] = [0; MAX_LOOP_FILTER+2]; - - sse_plane(&fs.rec.planes[pli], &fs.input.planes[pli], &mut v_tally, &mut h_tally, pli, bc, bit_depth); - - for i in 1..=MAX_LOOP_FILTER { - v_tally[i] += v_tally[i-1]; - h_tally[i] += h_tally[i-1]; - } + assert!(MAX_LOOP_FILTER < 999); + // i64 allows us to accumulate a total of ~ 35 bits worth of pixels + assert!( + fs.input.planes[0].cfg.width.ilog() + fs.input.planes[0].cfg.height.ilog() + < 35 + ); + + for pli in 0..PLANES { + let mut v_tally: [i64; MAX_LOOP_FILTER + 2] = [0; MAX_LOOP_FILTER + 2]; + let mut h_tally: [i64; MAX_LOOP_FILTER + 2] = [0; MAX_LOOP_FILTER + 2]; + + sse_plane( + &fs.rec.planes[pli], + &fs.input.planes[pli], + &mut v_tally, + &mut h_tally, + pli, + bc, + bit_depth + ); + + for i in 1..=MAX_LOOP_FILTER { + v_tally[i] += v_tally[i - 1]; + h_tally[i] += h_tally[i - 1]; + } - match pli { - 0 => { - let mut best_v = 999; - let mut best_h = 999; - for i in 0..=MAX_LOOP_FILTER { - if best_v == 999 || v_tally[best_v] > v_tally[i] { best_v = i; }; - if best_h == 999 || h_tally[best_h] > h_tally[i] { best_h = i; }; - } - fs.deblock.levels[0] = best_v as u8; - fs.deblock.levels[1] = best_h as u8; - }, - 1 | 2 => { - let mut best = 999; - for i in 0..=MAX_LOOP_FILTER { - if best == 999 || v_tally[best] + h_tally[best] > v_tally[i] + h_tally[i] { best = i; }; - } - fs.deblock.levels[pli+1] = best as u8; - }, - _ => {unreachable!()} + match pli { + 0 => { + let mut best_v = 999; + let mut best_h = 999; + for i in 0..=MAX_LOOP_FILTER { + if best_v == 999 || v_tally[best_v] > v_tally[i] { + best_v = i; + }; + if best_h == 999 || h_tally[best_h] > h_tally[i] { + best_h = i; + }; + } + fs.deblock.levels[0] = best_v as u8; + fs.deblock.levels[1] = best_h as u8; + } + 1 | 2 => { + let mut best = 999; + for i in 0..=MAX_LOOP_FILTER { + if best == 999 + || v_tally[best] + h_tally[best] > v_tally[i] + h_tally[i] + { + best = i; + }; } + fs.deblock.levels[pli + 1] = best as u8; + } + _ => unreachable!() } + } } -pub fn deblock_filter_optimize(fi: &FrameInvariants, fs: &mut FrameState, - bc: &mut BlockContext, bit_depth: usize) { - if fi.config.speed > 3 { - let q = ac_q(fi.base_q_idx, bit_depth) as i32; - let level = clamp (match bit_depth { - 8 => { - if fi.frame_type == FrameType::KEY { - q * 17563 - 421574 + (1<<18>>1) >> 18 - } else { - q * 6017 + 650707 + (1<<18>>1) >> 18 - } - } - 10 => { - if fi.frame_type == FrameType::KEY { - (q * 20723 + 4060632 + (1<<20>>1) >> 20) - 4 - } else { - q * 20723 + 4060632 + (1<<20>>1) >> 20 - } - } - 12 => { - if fi.frame_type == FrameType::KEY { - (q * 20723 + 16242526 + (1<<22>>1) >> 22) - 4 - } else { - q * 20723 + 16242526 + (1<<22>>1) >> 22 - } - } - _ => {assert!(false); 0} - }, 0, MAX_LOOP_FILTER as i32) as u8; - - fs.deblock.levels[0] = level; - fs.deblock.levels[1] = level; - fs.deblock.levels[2] = level; - fs.deblock.levels[3] = level; - } else { - sse_optimize(fs, bc, bit_depth); - } +pub fn deblock_filter_optimize( + fi: &FrameInvariants, fs: &mut FrameState, bc: &mut BlockContext, + bit_depth: usize +) { + if fi.config.speed > 3 { + let q = ac_q(fi.base_q_idx, bit_depth) as i32; + let level = clamp( + match bit_depth { + 8 => + if fi.frame_type == FrameType::KEY { + q * 17563 - 421574 + (1 << 18 >> 1) >> 18 + } else { + q * 6017 + 650707 + (1 << 18 >> 1) >> 18 + }, + 10 => + if fi.frame_type == FrameType::KEY { + (q * 20723 + 4060632 + (1 << 20 >> 1) >> 20) - 4 + } else { + q * 20723 + 4060632 + (1 << 20 >> 1) >> 20 + }, + 12 => + if fi.frame_type == FrameType::KEY { + (q * 20723 + 16242526 + (1 << 22 >> 1) >> 22) - 4 + } else { + q * 20723 + 16242526 + (1 << 22 >> 1) >> 22 + }, + _ => { + assert!(false); + 0 + } + }, + 0, + MAX_LOOP_FILTER as i32 + ) as u8; + + fs.deblock.levels[0] = level; + fs.deblock.levels[1] = level; + fs.deblock.levels[2] = level; + fs.deblock.levels[3] = level; + } else { + sse_optimize(fs, bc, bit_depth); + } }