Commit cb5c47f2, authored by Debargha Mukherjee, committed by Gerrit Code Review

Merge "Accelerated transform in high bit depth"

parents 194b374b 406030d1
......@@ -85,16 +85,26 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
# dct
#
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note: as optimized versions of these functions are added, we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht4x4_16_add/;
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
  # Hardware emulation: each prototype is registered once and its specialize
  # list names no optimized variant.
  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
  specialize qw/vp9_iht4x4_16_add/;

  add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
  specialize qw/vp9_iht8x8_64_add/;

  add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
  specialize qw/vp9_iht16x16_256_add/;
} else {
  # Otherwise the same three prototypes additionally list their sse2 variants.
  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
  specialize qw/vp9_iht4x4_16_add sse2/;

  add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
  specialize qw/vp9_iht8x8_64_add sse2/;

  add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
  specialize qw/vp9_iht16x16_256_add sse2/;
}
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
......
......@@ -12,14 +12,14 @@
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
__m128i in[2];
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
in[0] = _mm_loadu_si128((const __m128i *)(input));
in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
in[0] = load_input_data(input);
in[1] = load_input_data(input + 8);
switch (tx_type) {
case 0: // DCT_DCT
......@@ -77,21 +77,21 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
}
}
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
__m128i in[8];
const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data
in[0] = _mm_load_si128((const __m128i *)input);
in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
in[0] = load_input_data(input);
in[1] = load_input_data(input + 8 * 1);
in[2] = load_input_data(input + 8 * 2);
in[3] = load_input_data(input + 8 * 3);
in[4] = load_input_data(input + 8 * 4);
in[5] = load_input_data(input + 8 * 5);
in[6] = load_input_data(input + 8 * 6);
in[7] = load_input_data(input + 8 * 7);
switch (tx_type) {
case 0: // DCT_DCT
......@@ -144,8 +144,8 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
RECON_AND_STORE(dest + 7 * stride, in[7]);
}
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
int tx_type) {
void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride, int tx_type) {
__m128i in0[16], in1[16];
load_buffer_8x16(input, in0);
......
......@@ -626,39 +626,6 @@ if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes"))
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note: as optimized versions of these functions are added, we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_1_add/;
add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_16_add/;
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_1_add/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_64_add/;
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_12_add/;
add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_1_add/;
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_256_add/;
add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_10_add/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add/;
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add/;
add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_iwht4x4_1_add/;
......@@ -691,6 +658,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_16_add/;
add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_1_add/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_64_add/;
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_12_add/;
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_1_add/;
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_256_add/;
add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_10_add/;
add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_1_add/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add/;
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add/;
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct4x4_16_add/;
......@@ -706,6 +706,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct16x16_10_add/;
} else {
add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_16_add sse2/;
add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_1_add sse2/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_64_add sse2/;
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_12_add sse2/;
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_1_add sse2/;
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_256_add sse2/;
add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_10_add sse2/;
add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_1_add sse2/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add sse2/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2/;
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add sse2/;
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct4x4_16_add sse2/;
......
This diff is collapsed.
......@@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
......@@ -89,24 +90,35 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
res0[15] = tbuf[7];
}
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
// Builds one __m128i from the coefficients at |data| so that the 8-bit SSE2
// inverse transforms can also be used (for profile 0) when the library is
// built with CONFIG_VP9_HIGHBITDEPTH.
static INLINE __m128i load_input_data(const tran_low_t *data) {
#if !CONFIG_VP9_HIGHBITDEPTH
  // Plain 128-bit load; _mm_load_si128 expects |data| to be 16-byte aligned.
  const __m128i *const src = (const __m128i *)data;
  return _mm_load_si128(src);
#else
  // Pass the first eight elements individually to octa_set_epi16.
  // NOTE(review): presumably tran_low_t is wider than 16 bits in this
  // configuration and each value is narrowed to a 16-bit lane - confirm.
  return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
                        data[6], data[7]);
#endif
}
// Fills in[0] .. in[15] with vectors obtained from load_input_data(). The
// vector for entry k is read starting at input + k * 16, i.e. consecutive
// entries are 16 elements apart in |input|.
static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
  int row;
  for (row = 0; row < 16; ++row) {
    in[row] = load_input_data(input + row * 16);
  }
}
#define RECON_AND_STORE(dest, in_x) \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment