Commit 3caefbda authored by Kyle Siefring's avatar Kyle Siefring Committed by GitHub

Hookup avx2 subpel code (#851)

parent 07032e6b
......@@ -48,7 +48,16 @@ fn main() {
let mut config_include_arg = String::from("-I");
config_include_arg.push_str(&out_dir);
config_include_arg.push('/');
nasm_rs::compile_library_args("rav1easm", &["src/x86/ipred.asm", "src/x86/me.asm"], &[&config_include_arg, "-Isrc/"]);
nasm_rs::compile_library_args(
"rav1easm",
&[
"src/x86/data.asm",
"src/x86/ipred.asm",
"src/x86/mc.asm",
"src/x86/me.asm"
],
&[&config_include_arg, "-Isrc/"]
);
println!("cargo:rustc-link-lib=static=rav1easm");
rerun_dir("src/x86");
rerun_dir("src/ext/x86");
......
......@@ -7,9 +7,10 @@
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
use num_traits::*;
use plane::*;
use util::*;
#[cfg(all(target_arch = "x86_64", not(windows), feature = "nasm"))]
pub use self::nasm::*;
#[cfg(any(not(target_arch = "x86_64"), windows, not(feature = "nasm")))]
pub use self::native::*;
#[derive(Copy, Clone, Debug, PartialEq, PartialOrd)]
pub enum FilterMode {
......@@ -132,98 +133,325 @@ const SUBPEL_FILTERS: [[[i32; SUBPEL_FILTER_SIZE]; 16]; 6] = [
]
];
fn run_filter<T: AsPrimitive<i32>>(
src: &[T], stride: usize, filter: [i32; 8]
) -> i32 {
filter
.iter()
.zip(src.iter().step_by(stride))
.map(|(f, s)| f * s.as_())
.sum::<i32>()
}
#[cfg(all(target_arch = "x86_64", not(windows), feature = "nasm"))]
mod nasm {
use super::*;
use plane::*;
use util::*;
fn get_filter(
mode: FilterMode, frac: i32, length: usize
) -> [i32; SUBPEL_FILTER_SIZE] {
let filter_idx = if mode == FilterMode::BILINEAR || length > 4 {
mode as usize
} else {
(mode as usize).min(1) + 4
};
SUBPEL_FILTERS[filter_idx][frac as usize]
}
use std::mem;
pub fn put_8tap<'a>(
dst: &'a mut PlaneMutSlice<'a>, src: PlaneSlice, width: usize,
height: usize, col_frac: i32, row_frac: i32, bit_depth: usize,
mode: FilterMode
) {
let dst_stride = dst.plane.cfg.stride;
let dst_slice = dst.as_mut_slice();
let ref_stride = src.plane.cfg.stride;
let y_filter = get_filter(mode, row_frac, height);
let x_filter = get_filter(mode, col_frac, width);
let max_sample_val = ((1 << bit_depth) - 1) as i32;
let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
match (col_frac, row_frac) {
(0, 0) => {
let src_slice = src.as_slice();
for r in 0..height {
for c in 0..width {
dst_slice[r * dst_stride + c] = src_slice[r * ref_stride + c];
}
type PutFn = unsafe extern fn(
*mut u8,
libc::ptrdiff_t,
*const u8,
libc::ptrdiff_t,
i32,
i32,
i32,
i32
);
unsafe extern fn put_invalid(
_dst: *mut u8, _dst_stride: libc::ptrdiff_t, _src: *const u8,
_src_stride: libc::ptrdiff_t, _w: i32, _h: i32, _mx: i32, _my: i32
) {
unimplemented!();
}
macro_rules! decl_mc_fns {
($($func_name:ident),+) => {
extern {
$(
fn $func_name(
dst: *mut u8, dst_stride: libc::ptrdiff_t, src: *const u8,
src_stride: libc::ptrdiff_t, w: i32, h: i32, mx: i32, my: i32
);
)*
}
}
(0, _) => {
let src_slice = src.go_up(3).as_slice();
for r in 0..height {
for c in 0..width {
dst_slice[r * dst_stride + c] = round_shift(
run_filter(&src_slice[r * ref_stride + c..], ref_stride, y_filter),
7
)
.max(0)
.min(max_sample_val)
as u16;
}
}
decl_mc_fns!(
rav1e_put_8tap_regular_avx2,
rav1e_put_8tap_regular_smooth_avx2,
rav1e_put_8tap_regular_sharp_avx2,
rav1e_put_8tap_smooth_avx2,
rav1e_put_8tap_smooth_regular_avx2,
rav1e_put_8tap_smooth_sharp_avx2,
rav1e_put_8tap_sharp_avx2,
rav1e_put_8tap_sharp_regular_avx2,
rav1e_put_8tap_sharp_smooth_avx2,
rav1e_put_bilin_avx2
);
fn select_put_fn_avx2(mode_x: FilterMode, mode_y: FilterMode) -> PutFn {
match (mode_x, mode_y) {
(FilterMode::REGULAR, FilterMode::REGULAR) =>
rav1e_put_8tap_regular_avx2,
(FilterMode::REGULAR, FilterMode::SMOOTH) =>
rav1e_put_8tap_regular_smooth_avx2,
(FilterMode::REGULAR, FilterMode::SHARP) =>
rav1e_put_8tap_regular_sharp_avx2,
(FilterMode::SMOOTH, FilterMode::REGULAR) =>
rav1e_put_8tap_smooth_regular_avx2,
(FilterMode::SMOOTH, FilterMode::SMOOTH) => rav1e_put_8tap_smooth_avx2,
(FilterMode::SMOOTH, FilterMode::SHARP) =>
rav1e_put_8tap_smooth_sharp_avx2,
(FilterMode::SHARP, FilterMode::REGULAR) =>
rav1e_put_8tap_sharp_regular_avx2,
(FilterMode::SHARP, FilterMode::SMOOTH) =>
rav1e_put_8tap_sharp_smooth_avx2,
(FilterMode::SHARP, FilterMode::SHARP) => rav1e_put_8tap_sharp_avx2,
(FilterMode::BILINEAR, FilterMode::BILINEAR) => rav1e_put_bilin_avx2,
(_, _) => put_invalid
}
}
type PrepFn =
unsafe extern fn(*mut i16, *const u8, libc::ptrdiff_t, i32, i32, i32, i32);
unsafe extern fn prep_invalid(
_tmp: *mut i16, _src: *const u8, _src_stride: libc::ptrdiff_t, _w: i32,
_h: i32, _mx: i32, _my: i32
) {
unimplemented!();
}
macro_rules! decl_mct_fns {
($($func_name:ident),+) => {
extern {
$(
fn $func_name(
tmp: *mut i16, src: *const u8, src_stride: libc::ptrdiff_t, w: i32,
h: i32, mx: i32, my: i32
);
)*
}
}
(_, 0) => {
let src_slice = src.go_left(3).as_slice();
for r in 0..height {
for c in 0..width {
dst_slice[r * dst_stride + c] = round_shift(
round_shift(
run_filter(&src_slice[r * ref_stride + c..], 1, x_filter),
7 - intermediate_bits
),
intermediate_bits
)
.max(0)
.min(max_sample_val)
as u16;
}
decl_mct_fns!(
rav1e_prep_8tap_regular_avx2,
rav1e_prep_8tap_regular_smooth_avx2,
rav1e_prep_8tap_regular_sharp_avx2,
rav1e_prep_8tap_smooth_avx2,
rav1e_prep_8tap_smooth_regular_avx2,
rav1e_prep_8tap_smooth_sharp_avx2,
rav1e_prep_8tap_sharp_avx2,
rav1e_prep_8tap_sharp_regular_avx2,
rav1e_prep_8tap_sharp_smooth_avx2,
rav1e_prep_bilin_avx2
);
fn select_prep_fn_avx2(mode_x: FilterMode, mode_y: FilterMode) -> PrepFn {
match (mode_x, mode_y) {
(FilterMode::REGULAR, FilterMode::REGULAR) =>
rav1e_prep_8tap_regular_avx2,
(FilterMode::REGULAR, FilterMode::SMOOTH) =>
rav1e_prep_8tap_regular_smooth_avx2,
(FilterMode::REGULAR, FilterMode::SHARP) =>
rav1e_prep_8tap_regular_sharp_avx2,
(FilterMode::SMOOTH, FilterMode::REGULAR) =>
rav1e_prep_8tap_smooth_regular_avx2,
(FilterMode::SMOOTH, FilterMode::SMOOTH) => rav1e_prep_8tap_smooth_avx2,
(FilterMode::SMOOTH, FilterMode::SHARP) =>
rav1e_prep_8tap_smooth_sharp_avx2,
(FilterMode::SHARP, FilterMode::REGULAR) =>
rav1e_prep_8tap_sharp_regular_avx2,
(FilterMode::SHARP, FilterMode::SMOOTH) =>
rav1e_prep_8tap_sharp_smooth_avx2,
(FilterMode::SHARP, FilterMode::SHARP) => rav1e_prep_8tap_sharp_avx2,
(FilterMode::BILINEAR, FilterMode::BILINEAR) => rav1e_prep_bilin_avx2,
(_, _) => prep_invalid
}
}
extern {
fn rav1e_avg_avx2(
dst: *mut u8, dst_stride: libc::ptrdiff_t, tmp1: *const i16,
tmp2: *const i16, w: i32, h: i32
);
}
pub fn put_8tap<'a>(
dst: &'a mut PlaneMutSlice<'a>, src: PlaneSlice, width: usize,
height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode,
mode_y: FilterMode, bit_depth: usize
) {
#[cfg(all(target_arch = "x86_64", not(windows), feature = "nasm"))]
{
if is_x86_feature_detected!("avx2") {
let mut dst8: AlignedArray<[u8; 128 * 128]> =
UninitializedAlignedArray();
let mut src8: [u8; (128 + 7) * (128 + 7)] =
unsafe { mem::uninitialized() };
convert_slice_2d(
&mut src8,
width + 7,
src.go_left(3).go_up(3).as_slice(),
src.plane.cfg.stride,
width + 7,
height + 7
);
unsafe {
select_put_fn_avx2(mode_x, mode_y)(
dst8.array.as_mut_ptr(),
width as isize,
src8[(width + 7) * 3 + 3..].as_ptr(),
(width + 7) as isize,
width as i32,
height as i32,
col_frac,
row_frac
);
}
let dst_stride = dst.plane.cfg.stride;
let dst_slice = dst.as_mut_slice();
convert_slice_2d(
dst_slice,
dst_stride,
&dst8.array,
width,
width,
height
);
return;
}
}
(_, _) => {
let mut intermediate = [0 as i16; 8 * (128 + 7)];
super::native::put_8tap(
dst, src, width, height, col_frac, row_frac, mode_x, mode_y, bit_depth,
);
}
let src_slice = src.go_left(3).go_up(3).as_slice();
for cg in (0..width).step_by(8) {
for r in 0..height + 7 {
for c in cg..(cg + 8).min(width) {
intermediate[8 * r + (c - cg)] = round_shift(
run_filter(&src_slice[r * ref_stride + c..], 1, x_filter),
7 - intermediate_bits
) as i16;
pub fn prep_8tap<'a>(
tmp: &mut [i16], src: PlaneSlice, width: usize, height: usize,
col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode,
bit_depth: usize
) {
if is_x86_feature_detected!("avx2") {
let mut src8: [u8; (128 + 7) * (128 + 7)] =
unsafe { mem::uninitialized() };
convert_slice_2d(
&mut src8,
width + 7,
src.go_left(3).go_up(3).as_slice(),
src.plane.cfg.stride,
width + 7,
height + 7
);
unsafe {
select_prep_fn_avx2(mode_x, mode_y)(
tmp.as_mut_ptr(),
src8[(width + 7) * 3 + 3..].as_ptr(),
(width + 7) as isize,
width as i32,
height as i32,
col_frac,
row_frac
);
}
} else {
super::native::prep_8tap(
tmp, src, width, height, col_frac, row_frac, mode_x, mode_y,
bit_depth,
);
}
}
pub fn mc_avg<'a>(
dst: &'a mut PlaneMutSlice<'a>, tmp1: &[i16], tmp2: &[i16], width: usize,
height: usize, bit_depth: usize
) {
if is_x86_feature_detected!("avx2") {
let mut dst8: AlignedArray<[u8; 128 * 128]> =
UninitializedAlignedArray();
unsafe {
rav1e_avg_avx2(
dst8.array.as_mut_ptr(),
width as isize,
tmp1.as_ptr(),
tmp2.as_ptr(),
width as i32,
height as i32
);
}
let dst_stride = dst.plane.cfg.stride;
let dst_slice = dst.as_mut_slice();
convert_slice_2d(
dst_slice,
dst_stride,
&dst8.array,
width,
width,
height
);
return;
} else {
super::native::mc_avg(dst, tmp1, tmp2, width, height, bit_depth);
}
}
}
mod native {
use super::*;
use num_traits::*;
use plane::*;
use util::*;
fn run_filter<T: AsPrimitive<i32>>(
src: &[T], stride: usize, filter: [i32; 8]
) -> i32 {
filter
.iter()
.zip(src.iter().step_by(stride))
.map(|(f, s)| f * s.as_())
.sum::<i32>()
}
fn get_filter(
mode: FilterMode, frac: i32, length: usize
) -> [i32; SUBPEL_FILTER_SIZE] {
let filter_idx = if mode == FilterMode::BILINEAR || length > 4 {
mode as usize
} else {
(mode as usize).min(1) + 4
};
SUBPEL_FILTERS[filter_idx][frac as usize]
}
pub fn put_8tap<'a>(
dst: &'a mut PlaneMutSlice<'a>, src: PlaneSlice, width: usize,
height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode,
mode_y: FilterMode, bit_depth: usize
) {
let dst_stride = dst.plane.cfg.stride;
let dst_slice = dst.as_mut_slice();
let ref_stride = src.plane.cfg.stride;
let y_filter = get_filter(mode_y, row_frac, height);
let x_filter = get_filter(mode_x, col_frac, width);
let max_sample_val = ((1 << bit_depth) - 1) as i32;
let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
match (col_frac, row_frac) {
(0, 0) => {
let src_slice = src.as_slice();
for r in 0..height {
for c in 0..width {
dst_slice[r * dst_stride + c] = src_slice[r * ref_stride + c];
}
}
}
(0, _) => {
let src_slice = src.go_up(3).as_slice();
for r in 0..height {
for c in cg..(cg + 8).min(width) {
for c in 0..width {
dst_slice[r * dst_stride + c] = round_shift(
run_filter(&intermediate[8 * r + c - cg..], 8, y_filter),
7 + intermediate_bits
run_filter(
&src_slice[r * ref_stride + c..],
ref_stride,
y_filter
),
7
)
.max(0)
.min(max_sample_val)
......@@ -231,93 +459,142 @@ pub fn put_8tap<'a>(
}
}
}
}
}
}
pub fn prep_8tap<'a>(
tmp: &mut [i16], src: PlaneSlice, width: usize, height: usize,
col_frac: i32, row_frac: i32, bit_depth: usize, mode: FilterMode
) {
let ref_stride = src.plane.cfg.stride;
let y_filter = get_filter(mode, row_frac, height);
let x_filter = get_filter(mode, col_frac, width);
let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
match (col_frac, row_frac) {
(0, 0) => {
let src_slice = src.as_slice();
for r in 0..height {
for c in 0..width {
tmp[r * width + c] =
(src_slice[r * ref_stride + c] << intermediate_bits) as i16;
(_, 0) => {
let src_slice = src.go_left(3).as_slice();
for r in 0..height {
for c in 0..width {
dst_slice[r * dst_stride + c] = round_shift(
round_shift(
run_filter(&src_slice[r * ref_stride + c..], 1, x_filter),
7 - intermediate_bits
),
intermediate_bits
)
.max(0)
.min(max_sample_val)
as u16;
}
}
}
}
(0, _) => {
let src_slice = src.go_up(3).as_slice();
for r in 0..height {
for c in 0..width {
tmp[r * width + c] = round_shift(
run_filter(&src_slice[r * ref_stride + c..], ref_stride, y_filter),
7 - intermediate_bits
) as i16;
(_, _) => {
let mut intermediate = [0 as i16; 8 * (128 + 7)];
let src_slice = src.go_left(3).go_up(3).as_slice();
for cg in (0..width).step_by(8) {
for r in 0..height + 7 {
for c in cg..(cg + 8).min(width) {
intermediate[8 * r + (c - cg)] = round_shift(
run_filter(&src_slice[r * ref_stride + c..], 1, x_filter),
7 - intermediate_bits
) as i16;
}
}
for r in 0..height {
for c in cg..(cg + 8).min(width) {
dst_slice[r * dst_stride + c] = round_shift(
run_filter(&intermediate[8 * r + c - cg..], 8, y_filter),
7 + intermediate_bits
)
.max(0)
.min(max_sample_val)
as u16;
}
}
}
}
}
(_, 0) => {
let src_slice = src.go_left(3).as_slice();
for r in 0..height {
for c in 0..width {
tmp[r * width + c] = round_shift(
run_filter(&src_slice[r * ref_stride + c..], 1, x_filter),
7 - intermediate_bits
) as i16;
}
pub fn prep_8tap<'a>(
tmp: &mut [i16], src: PlaneSlice, width: usize, height: usize,
col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode,
bit_depth: usize
) {
let ref_stride = src.plane.cfg.stride;
let y_filter = get_filter(mode_y, row_frac, height);
let x_filter = get_filter(mode_x, col_frac, width);
let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
match (col_frac, row_frac) {
(0, 0) => {
let src_slice = src.as_slice();
for r in 0..height {
for c in 0..width {
tmp[r * width + c] =
(src_slice[r * ref_stride + c] << intermediate_bits) as i16;
}
}
}
}
(_, _) => {
let mut intermediate = [0 as i16; 8 * (128 + 7)];
let src_slice = src.go_left(3).go_up(3).as_slice();
for cg in (0..width).step_by(8) {
for r in 0..height + 7 {
for c in cg..(cg + 8).min(width) {
intermediate[8 * r + (c - cg)] = round_shift(
run_filter(&src_slice[r * ref_stride + c..], 1, x_filter),
(0, _) => {
let src_slice = src.go_up(3).as_slice();
for r in 0..height {
for c in 0..width {
tmp[r * width + c] = round_shift(
run_filter(
&src_slice[r * ref_stride + c..],
ref_stride,
y_filter
),
7 - intermediate_bits
) as i16;
}
}
}
(_, 0) => {
let src_slice = src.go_left(3).as_slice();
for r in 0..height {
for c in cg..(cg + 8).min(width) {
for c in 0..width {
tmp[r * width + c] = round_shift(
run_filter(&intermediate[8 * r + c - cg..], 8, y_filter),
7
run_filter(&src_slice[r * ref_stride + c..], 1, x_filter),
7 - intermediate_bits
) as i16;
}
}
}
(_, _) => {
let mut intermediate = [0 as i16; 8 * (128 + 7)];
let src_slice = src.go_left(3).go_up(3).as_slice();
for cg in (0..width).step_by(8) {
for r in 0..height + 7 {
for c in cg..(cg + 8).min(width) {
intermediate[8 * r + (c - cg)] = round_shift(
run_filter(&src_slice[r * ref_stride + c..], 1, x_filter),
7 - intermediate_bits
) as i16;
}
}
for r in 0..height {
for c in cg..(cg + 8).min(width) {
tmp[r * width + c] = round_shift(
run_filter(&intermediate[8 * r + c - cg..], 8, y_filter),
7
) as i16;
}
}
}
}
}
}
}
pub fn mc_avg<'a>(
dst: &'a mut PlaneMutSlice<'a>, tmp1: &[i16], tmp2: &[i16], width: usize,
height: usize, bit_depth: usize
) {
let dst_stride = dst.plane.cfg.stride;
let dst_slice = dst.as_mut_slice();
let max_sample_val = ((1 << bit_depth) - 1) as i32;
let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
for r in 0..height {
for c in 0..width {
dst_slice[r * dst_stride + c] = round_shift(
(tmp1[r * width + c] + tmp2[r * width + c]) as i32,
intermediate_bits + 1
)
.max(0)
.min(max_sample_val) as u16;
pub fn mc_avg<'a>(
dst: &'a mut PlaneMutSlice<'a>, tmp1: &[i16], tmp2: &[i16], width: usize,
height: usize, bit_depth: usize
) {
let dst_stride = dst.plane.cfg.stride;
let dst_slice = dst.as_mut_slice();
let max_sample_val = ((1 << bit_depth) - 1) as i32;
let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
for r in 0..height {
for c in 0..width {
dst_slice[r * dst_stride + c] = round_shift(
(tmp1[r * width + c] + tmp2[r * width + c]) as i32,
intermediate_bits + 1
)
.max(0)
.min(max_sample_val) as u16;
}
}
}
}
......@@ -1142,8 +1142,9 @@ impl PredictionMode {
height,
col_frac,
row_frac,
fi.sequence.bit_depth,
mode
mode,
mode,
fi.sequence.bit_depth
);
}
None => ()
......@@ -1165,8 +1166,9 @@ impl PredictionMode {
height,
col_frac,
row_frac,
fi.sequence.bit_depth,
mode
mode,
mode,
fi.sequence.bit_depth
);
}
None => ()
......
......@@ -230,3 +230,17 @@ pub fn msb(x: i32) -> i32 {
pub fn round_shift(value: i32, bit: usize) -> i32 {
(value + (1 << bit >> 1)) >> bit
}
pub fn convert_slice_2d<NEW: 'static + Copy, OLD: AsPrimitive<NEW>>(
dst: &mut [NEW], dst_stride: usize, src: &[OLD], src_stride: usize,
width: usize, height: usize
) {
for r in 0..height {
for (a, b) in dst[r * dst_stride..r * dst_stride + width]
.iter_mut()
.zip(src[r * src_stride..r * src_stride + width].iter())
{
*a = (*b).as_();
}
}
}
; Copyright (c) 2018, The rav1e contributors. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS