Commit 574d4795 authored by Kyle Siefring

Add avx2 for forward txfms

parent 68f62e69
Pipeline #1788 failed with stages
in 33 minutes and 42 seconds
......@@ -24,7 +24,7 @@ pub fn av1_idct4(c: &mut Criterion) {
let (input, mut output) = init_buffers(4);
c.bench_function("av1_idct4_8", move |b| {
b.iter(|| transform::av1_idct4(&input[..], &mut output[..], 16))
b.iter(|| transform::inverse::av1_idct4(&input[..], &mut output[..], 16))
});
}
......@@ -32,7 +32,7 @@ pub fn av1_idct8(c: &mut Criterion) {
let (input, mut output) = init_buffers(8);
c.bench_function("av1_idct8_8", move |b| {
b.iter(|| transform::av1_idct8(&input[..], &mut output[..], 16))
b.iter(|| transform::inverse::av1_idct8(&input[..], &mut output[..], 16))
});
}
......@@ -40,7 +40,9 @@ pub fn av1_iidentity4(c: &mut Criterion) {
let (input, mut output) = init_buffers(4);
c.bench_function("av1_iidentity4_8", move |b| {
b.iter(|| transform::av1_iidentity4(&input[..], &mut output[..], 16))
b.iter(|| {
transform::inverse::av1_iidentity4(&input[..], &mut output[..], 16)
})
});
}
......@@ -48,7 +50,9 @@ pub fn av1_iidentity8(c: &mut Criterion) {
let (input, mut output) = init_buffers(8);
c.bench_function("av1_iidentity8_8", move |b| {
b.iter(|| transform::av1_iidentity8(&input[..], &mut output[..], 16))
b.iter(|| {
transform::inverse::av1_iidentity8(&input[..], &mut output[..], 16)
})
});
}
......@@ -56,7 +60,7 @@ pub fn av1_iadst4(c: &mut Criterion) {
let (input, mut output) = init_buffers(4);
c.bench_function("av1_iadst4_8", move |b| {
b.iter(|| transform::av1_iadst4(&input[..], &mut output[..], 16))
b.iter(|| transform::inverse::av1_iadst4(&input[..], &mut output[..], 16))
});
}
......@@ -64,7 +68,7 @@ pub fn av1_iadst8(c: &mut Criterion) {
let (input, mut output) = init_buffers(8);
c.bench_function("av1_iadst8_8", move |b| {
b.iter(|| transform::av1_iadst8(&input[..], &mut output[..], 16))
b.iter(|| transform::inverse::av1_iadst8(&input[..], &mut output[..], 16))
});
}
......@@ -72,7 +76,9 @@ pub fn daala_fdct4(c: &mut Criterion) {
let (input, mut output) = init_buffers(4);
c.bench_function("daala_fdct4", move |b| {
b.iter(|| transform::daala_fdct4(&input[..], &mut output[..]))
b.iter(|| {
transform::forward::native::daala_fdct4(&input[..], &mut output[..])
})
});
}
......@@ -80,7 +86,9 @@ pub fn daala_fdct8(c: &mut Criterion) {
let (input, mut output) = init_buffers(8);
c.bench_function("daala_fdct8", move |b| {
b.iter(|| transform::daala_fdct8(&input[..], &mut output[..]))
b.iter(|| {
transform::forward::native::daala_fdct8(&input[..], &mut output[..])
})
});
}
......@@ -88,7 +96,9 @@ pub fn fidentity4(c: &mut Criterion) {
let (input, mut output) = init_buffers(4);
c.bench_function("fidentity4", move |b| {
b.iter(|| transform::fidentity4(&input[..], &mut output[..]))
b.iter(|| {
transform::forward::native::fidentity4(&input[..], &mut output[..])
})
});
}
......@@ -96,7 +106,9 @@ pub fn fidentity8(c: &mut Criterion) {
let (input, mut output) = init_buffers(8);
c.bench_function("fidentity8", move |b| {
b.iter(|| transform::fidentity8(&input[..], &mut output[..]))
b.iter(|| {
transform::forward::native::fidentity8(&input[..], &mut output[..])
})
});
}
......@@ -104,7 +116,9 @@ pub fn daala_fdst_vii_4(c: &mut Criterion) {
let (input, mut output) = init_buffers(4);
c.bench_function("daala_fdst_vii_4", move |b| {
b.iter(|| transform::daala_fdst_vii_4(&input[..], &mut output[..]))
b.iter(|| {
transform::forward::native::daala_fdst_vii_4(&input[..], &mut output[..])
})
});
}
......@@ -112,7 +126,9 @@ pub fn daala_fdst8(c: &mut Criterion) {
let (input, mut output) = init_buffers(8);
c.bench_function("daala_fdst8", move |b| {
b.iter(|| transform::daala_fdst8(&input[..], &mut output[..]))
b.iter(|| {
transform::forward::native::daala_fdst8(&input[..], &mut output[..])
})
});
}
......
......@@ -9,6 +9,7 @@
use crate::cpu_features::CpuFeatureLevel;
use crate::tiling::PlaneRegionMut;
use crate::transform::inverse::*;
use crate::transform::*;
use crate::util::AlignedArray;
use crate::Pixel;
......@@ -218,7 +219,9 @@ mod test {
*d = random::<u8>();
*r = i16::from(*s) - i16::from(*d);
}
forward_transform(res, freq, tx_size.width(), tx_size, $ENUM, 8);
forward_transform(
res, freq, tx_size.width(), tx_size, $ENUM, 8, CpuFeatureLevel::NATIVE
);
let mut native_dst = dst.clone();
unsafe { crate::predict::[<Block $W x $H>]::[<inv_txfm2d_add_ $OPT>](
......
This diff is collapsed.
......@@ -9,6 +9,7 @@
use crate::cpu_features::CpuFeatureLevel;
use crate::tiling::PlaneRegionMut;
use crate::transform::inverse::*;
use crate::transform::*;
use crate::util::AlignedArray;
use crate::Pixel;
......@@ -256,7 +257,9 @@ mod test {
*d = random::<u8>();
*r = i16::from(*s) - i16::from(*d);
}
forward_transform(res, freq, tx_size.width(), tx_size, $ENUM, 8);
forward_transform(
res, freq, tx_size.width(), tx_size, $ENUM, 8, CpuFeatureLevel::NATIVE
);
let mut native_dst = dst.clone();
unsafe { crate::predict::[<Block $W x $H>]::[<inv_txfm2d_add_ $OPT>](
......
......@@ -7,4 +7,5 @@
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
pub mod forward;
pub mod inverse;
......@@ -1191,6 +1191,7 @@ pub fn encode_tx_block<T: Pixel>(
tx_size,
tx_type,
fi.sequence.bit_depth,
fi.cpu_feature_level,
);
ts.qc.quantize(coeffs, qcoeffs, tx_size, tx_type);
......
......@@ -67,6 +67,9 @@ mod hawktracer {
#[cfg(any(cargo_c, feature = "capi"))]
pub mod capi;
#[macro_use]
mod transform;
mod activity;
pub(crate) mod asm;
mod cpu_features;
......@@ -77,7 +80,6 @@ mod predict;
mod quantize;
mod rdo;
mod rdo_tables;
mod transform;
#[macro_use]
mod util;
mod cdef;
......
This diff is collapsed.
This diff is collapsed.
......@@ -10,8 +10,11 @@
#![allow(non_camel_case_types)]
#![allow(dead_code)]
pub use self::forward::*;
pub use self::inverse::*;
#[macro_use]
pub mod forward_shared;
use self::forward::*;
use self::inverse::*;
use crate::context::MI_SIZE_LOG2;
use crate::partition::{BlockSize, BlockSize::*};
......@@ -21,8 +24,8 @@ use crate::util::*;
use crate::cpu_features::CpuFeatureLevel;
use TxSize::*;
mod forward;
mod inverse;
pub mod forward;
pub mod inverse;
pub static RAV1E_TX_TYPES: &[TxType] = &[
TxType::DCT_DCT,
......@@ -382,31 +385,31 @@ const HTX_TAB: [TxType1D; TX_TYPES] = [
pub fn forward_transform(
input: &[i16], output: &mut [i32], stride: usize, tx_size: TxSize,
tx_type: TxType, bit_depth: usize,
tx_type: TxType, bit_depth: usize, cpu: CpuFeatureLevel,
) {
use self::TxSize::*;
match tx_size {
TX_4X4 => fht4x4(input, output, stride, tx_type, bit_depth),
TX_8X8 => fht8x8(input, output, stride, tx_type, bit_depth),
TX_16X16 => fht16x16(input, output, stride, tx_type, bit_depth),
TX_32X32 => fht32x32(input, output, stride, tx_type, bit_depth),
TX_64X64 => fht64x64(input, output, stride, tx_type, bit_depth),
TX_4X8 => fht4x8(input, output, stride, tx_type, bit_depth),
TX_8X4 => fht8x4(input, output, stride, tx_type, bit_depth),
TX_8X16 => fht8x16(input, output, stride, tx_type, bit_depth),
TX_16X8 => fht16x8(input, output, stride, tx_type, bit_depth),
TX_16X32 => fht16x32(input, output, stride, tx_type, bit_depth),
TX_32X16 => fht32x16(input, output, stride, tx_type, bit_depth),
TX_32X64 => fht32x64(input, output, stride, tx_type, bit_depth),
TX_64X32 => fht64x32(input, output, stride, tx_type, bit_depth),
TX_4X16 => fht4x16(input, output, stride, tx_type, bit_depth),
TX_16X4 => fht16x4(input, output, stride, tx_type, bit_depth),
TX_8X32 => fht8x32(input, output, stride, tx_type, bit_depth),
TX_32X8 => fht32x8(input, output, stride, tx_type, bit_depth),
TX_16X64 => fht16x64(input, output, stride, tx_type, bit_depth),
TX_64X16 => fht64x16(input, output, stride, tx_type, bit_depth),
TX_4X4 => fht4x4(input, output, stride, tx_type, bit_depth, cpu),
TX_8X8 => fht8x8(input, output, stride, tx_type, bit_depth, cpu),
TX_16X16 => fht16x16(input, output, stride, tx_type, bit_depth, cpu),
TX_32X32 => fht32x32(input, output, stride, tx_type, bit_depth, cpu),
TX_64X64 => fht64x64(input, output, stride, tx_type, bit_depth, cpu),
TX_4X8 => fht4x8(input, output, stride, tx_type, bit_depth, cpu),
TX_8X4 => fht8x4(input, output, stride, tx_type, bit_depth, cpu),
TX_8X16 => fht8x16(input, output, stride, tx_type, bit_depth, cpu),
TX_16X8 => fht16x8(input, output, stride, tx_type, bit_depth, cpu),
TX_16X32 => fht16x32(input, output, stride, tx_type, bit_depth, cpu),
TX_32X16 => fht32x16(input, output, stride, tx_type, bit_depth, cpu),
TX_32X64 => fht32x64(input, output, stride, tx_type, bit_depth, cpu),
TX_64X32 => fht64x32(input, output, stride, tx_type, bit_depth, cpu),
TX_4X16 => fht4x16(input, output, stride, tx_type, bit_depth, cpu),
TX_16X4 => fht16x4(input, output, stride, tx_type, bit_depth, cpu),
TX_8X32 => fht8x32(input, output, stride, tx_type, bit_depth, cpu),
TX_32X8 => fht32x8(input, output, stride, tx_type, bit_depth, cpu),
TX_16X64 => fht16x64(input, output, stride, tx_type, bit_depth, cpu),
TX_64X16 => fht64x16(input, output, stride, tx_type, bit_depth, cpu),
}
}
......@@ -450,6 +453,8 @@ mod test {
fn test_roundtrip<T: Pixel>(
tx_size: TxSize, tx_type: TxType, tolerance: i16,
) {
let cpu = CpuFeatureLevel::default();
let mut src_storage = [T::cast_from(0); 64 * 64];
let src = &mut src_storage[..tx_size.area()];
// dynamic allocation: test
......@@ -466,14 +471,14 @@ mod test {
*d = T::cast_from(random::<u8>());
*r = i16::cast_from(*s) - i16::cast_from(*d);
}
forward_transform(res, freq, tx_size.width(), tx_size, tx_type, 8);
forward_transform(res, freq, tx_size.width(), tx_size, tx_type, 8, cpu);
inverse_transform_add(
freq,
&mut dst.as_region_mut(),
tx_size,
tx_type,
8,
CpuFeatureLevel::default(),
cpu,
);
for (s, d) in src.iter().zip(dst.data.iter()) {
......@@ -523,6 +528,8 @@ mod test {
(TX_4X4, ADST_DCT, 0),
(TX_4X4, DCT_ADST, 0),
(TX_4X4, ADST_ADST, 0),
(TX_4X4, FLIPADST_DCT, 0),
(TX_4X4, DCT_FLIPADST, 0),
(TX_4X4, IDTX, 0),
(TX_4X4, V_DCT, 0),
(TX_4X4, H_DCT, 0),
......@@ -532,6 +539,8 @@ mod test {
(TX_8X8, ADST_DCT, 1),
(TX_8X8, DCT_ADST, 1),
(TX_8X8, ADST_ADST, 1),
(TX_8X8, FLIPADST_DCT, 1),
(TX_8X8, DCT_FLIPADST, 1),
(TX_8X8, IDTX, 0),
(TX_8X8, V_DCT, 0),
(TX_8X8, H_DCT, 0),
......@@ -541,6 +550,8 @@ mod test {
(TX_16X16, ADST_DCT, 1),
(TX_16X16, DCT_ADST, 1),
(TX_16X16, ADST_ADST, 1),
(TX_16X16, FLIPADST_DCT, 1),
(TX_16X16, DCT_FLIPADST, 1),
(TX_16X16, IDTX, 0),
(TX_16X16, V_DCT, 1),
(TX_16X16, H_DCT, 1),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment