Commit 1ac47a7c authored by Urvang Joshi

round_shift_array: Use SSE4 version everywhere.

CPU usage of round_shift_array drops from 2.01% to 1.04%.
Overall encoding is slightly faster (~0.05%).

This means some of the intermediate arrays have to be aligned.
Also, these functions were moved to common header/source files.
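
For example, the stack buffers that feed the SSE4.1 path change from a
plain declaration to an aligned one (both lines appear verbatim in the
diff below):

  int32_t txfm_buf[4 * 8];
  DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]);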

BUG=aomedia:1106

Change-Id: I492c9b1f2e7339c6cb83cfe68a61218642654d1b
parent 3ae12355
......@@ -536,6 +536,10 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
} # CONFIG_HIGHBITDEPTH
# Helper functions.
add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
specialize "av1_round_shift_array", qw/sse4_1/;
#
# Encoder functions.
#
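For context, a minimal sketch of the runtime dispatch that the add_proto/specialize
lines above drive. The real table is generated by the rtcd script at build time;
av1_round_shift_array_c, av1_round_shift_array_sse4_1, x86_simd_caps() and
HAS_SSE4_1 are real libaom symbols, but the setup function shown here is
illustrative, not the generated code.

void (*av1_round_shift_array)(int32_t *arr, int size, int bit);

static void setup_rtcd_internal(void) {
  const int flags = x86_simd_caps();                 /* runtime CPU feature detection */
  av1_round_shift_array = av1_round_shift_array_c;   /* portable fallback */
  if (flags & HAS_SSE4_1)
    av1_round_shift_array = av1_round_shift_array_sse4_1;
}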
......
......@@ -26,6 +26,7 @@ set(AOM_AV1_COMMON_SOURCES
"${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h"
"${AOM_ROOT}/av1/common/av1_loopfilter.c"
"${AOM_ROOT}/av1/common/av1_loopfilter.h"
"${AOM_ROOT}/av1/common/av1_txfm.c"
"${AOM_ROOT}/av1/common/av1_txfm.h"
"${AOM_ROOT}/av1/common/blockd.c"
"${AOM_ROOT}/av1/common/blockd.h"
......@@ -176,6 +177,8 @@ set(AOM_AV1_COMMON_INTRIN_SSSE3
"${AOM_ROOT}/av1/common/x86/av1_convolve_ssse3.c")
set(AOM_AV1_COMMON_INTRIN_SSE4_1
"${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c"
"${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h"
"${AOM_ROOT}/av1/common/x86/av1_fwd_txfm1d_sse4.c"
"${AOM_ROOT}/av1/common/x86/av1_fwd_txfm2d_sse4.c"
"${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c")
......
......@@ -69,6 +69,9 @@ AV1_COMMON_SRCS-yes += common/scan.c
AV1_COMMON_SRCS-yes += common/scan.h
# TODO(angiebird) the forward transform belongs under encoder/
AV1_COMMON_SRCS-yes += common/av1_txfm.h
AV1_COMMON_SRCS-yes += common/av1_txfm.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm_sse4.h
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm_sse4.c
AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.h
AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.c
AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d.h
......
......@@ -11,6 +11,7 @@
#include <assert.h>
#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom_dsp/txfm_common.h"
#include "av1/common/enums.h"
......@@ -115,7 +116,7 @@ static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
// flip upside down
temp_in[r] = input[(txfm_size_row - r - 1) * stride + c];
}
round_shift_array(temp_in, txfm_size_row, -shift[0]);
av1_round_shift_array(temp_in, txfm_size_row, -shift[0]);
// Multiply everything by Sqrt2 on the larger dimension if the
// transform is rectangular and the size difference is a factor of 2.
// If the size difference is a factor of 4, multiply by
......@@ -124,10 +125,10 @@ static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = (int32_t)fdct_round_shift(temp_in[r] * Sqrt2);
} else if (rect_type == 2) {
round_shift_array(temp_in, txfm_size_row, -rect_type2_shift);
av1_round_shift_array(temp_in, txfm_size_row, -rect_type2_shift);
}
txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
round_shift_array(temp_out, txfm_size_row, -shift[1]);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
if (cfg->lr_flip == 0) {
for (r = 0; r < txfm_size_row; ++r)
buf[r * txfm_size_col + c] = temp_out[r];
......@@ -154,14 +155,14 @@ static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
}
txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col,
cos_bit_row, stage_range_row);
round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]);
av1_round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]);
}
}
void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[4 * 8];
DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]);
int16_t rinput[4 * 8];
TX_SIZE tx_size = TX_4X8;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -194,7 +195,7 @@ void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[8 * 16];
DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]);
int16_t rinput[8 * 16];
TX_SIZE tx_size = TX_8X16;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -227,7 +228,7 @@ void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[16 * 32];
DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]);
int16_t rinput[16 * 32];
TX_SIZE tx_size = TX_16X32;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -260,7 +261,7 @@ void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[4 * 16];
DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]);
int16_t rinput[4 * 16];
TX_SIZE tx_size = TX_4X16;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -293,7 +294,7 @@ void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[32 * 8];
DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]);
int16_t rinput[32 * 8];
TX_SIZE tx_size = TX_8X32;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -378,7 +379,7 @@ void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[32 * 64];
DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]);
int16_t rinput[64 * 32];
TX_SIZE tx_size = TX_32X64;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -424,7 +425,7 @@ void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[64 * 16];
DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]);
int16_t rinput[64 * 16];
TX_SIZE tx_size = TX_16X64;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......
......@@ -9,6 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom_dsp/inv_txfm.h"
#include "av1/common/enums.h"
......@@ -185,7 +186,7 @@ static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
// Rows
for (r = 0; r < txfm_size_row; ++r) {
txfm_func_row(input, buf_ptr, cos_bit_row, stage_range_row);
round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
// Multiply everything by Sqrt2 if the transform is rectangular with
// log ratio being 1 or -1, if the log ratio is 2 or -2, multiply by
// 2^rect_type2_shift.
......@@ -193,7 +194,7 @@ static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
for (c = 0; c < txfm_size_col; ++c)
buf_ptr[c] = (int32_t)dct_const_round_shift(buf_ptr[c] * Sqrt2);
} else if (rect_type2_shift) {
round_shift_array(buf_ptr, txfm_size_col, -rect_type2_shift);
av1_round_shift_array(buf_ptr, txfm_size_col, -rect_type2_shift);
}
input += txfm_size_col;
buf_ptr += txfm_size_col;
......@@ -210,7 +211,7 @@ static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
round_shift_array(temp_out, txfm_size_row, -shift[1]);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
if (cfg->ud_flip == 0) {
for (r = 0; r < txfm_size_row; ++r) {
output[r * stride + c] =
......@@ -241,14 +242,14 @@ static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output,
void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
int txfm_buf[4 * 8 + 8 + 8];
DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd);
}
void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int txfm_buf[8 * 4 + 8 + 8];
DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
int32_t rinput[8 * 4];
uint16_t routput[8 * 4];
TX_SIZE tx_size = TX_8X4;
......@@ -263,21 +264,21 @@ void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output,
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
#else
int txfm_buf[8 * 4 + 4 + 4];
DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 4 + 4]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd);
#endif
}
void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
int txfm_buf[8 * 16 + 16 + 16];
DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd);
}
void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int txfm_buf[16 * 8 + 16 + 16];
DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]);
int32_t rinput[16 * 8];
uint16_t routput[16 * 8];
TX_SIZE tx_size = TX_16X8;
......@@ -292,21 +293,21 @@ void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output,
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
#else
int txfm_buf[16 * 8 + 8 + 8];
DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 8 + 8]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd);
#endif
}
void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
int txfm_buf[16 * 32 + 32 + 32];
DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd);
}
void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int txfm_buf[32 * 16 + 32 + 32];
DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]);
int32_t rinput[32 * 16];
uint16_t routput[32 * 16];
TX_SIZE tx_size = TX_32X16;
......@@ -321,32 +322,32 @@ void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output,
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
#else
int txfm_buf[32 * 16 + 16 + 16];
DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 16 + 16]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd);
#endif
}
void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
int txfm_buf[4 * 4 + 4 + 4];
DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd);
}
void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
int txfm_buf[8 * 8 + 8 + 8];
DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 + 8 + 8]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd);
}
void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
int txfm_buf[16 * 16 + 16 + 16];
DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd);
}
void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
int txfm_buf[32 * 32 + 32 + 32];
DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd);
}
......@@ -363,7 +364,7 @@ void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
}
memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input));
int txfm_buf[64 * 64 + 64 + 64];
DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64,
bd);
}
......@@ -379,7 +380,7 @@ void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output,
memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
}
#if CONFIG_TXMG
int txfm_buf[64 * 32 + 64 + 64];
DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);
int32_t rinput[64 * 32];
uint16_t routput[64 * 32];
TX_SIZE tx_size = TX_64X32;
......@@ -394,7 +395,7 @@ void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output,
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
#else
int txfm_buf[64 * 32 + 64 + 64];
DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);
inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32,
bd);
#endif // CONFIG_TXMG
......@@ -408,7 +409,7 @@ void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output,
int32_t mod_input[32 * 64];
memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input));
memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input));
int txfm_buf[64 * 32 + 64 + 64];
DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);
inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64,
bd);
}
......@@ -421,7 +422,7 @@ void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output,
int32_t mod_input[16 * 64];
memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input));
memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input));
int txfm_buf[16 * 64 + 64 + 64];
DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);
inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64,
bd);
}
......@@ -437,7 +438,7 @@ void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output,
memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
}
#if CONFIG_TXMG
int txfm_buf[16 * 64 + 64 + 64];
DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);
int32_t rinput[16 * 64];
uint16_t routput[16 * 64];
TX_SIZE tx_size = TX_64X16;
......@@ -452,7 +453,7 @@ void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output,
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
#else
int txfm_buf[16 * 64 + 64 + 64];
DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);
inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16,
bd);
#endif // CONFIG_TXMG
......@@ -461,14 +462,14 @@ void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output,
void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
int txfm_buf[4 * 16 + 16 + 16];
DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd);
}
void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int txfm_buf[4 * 16 + 16 + 16];
DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
int32_t rinput[4 * 16];
uint16_t routput[4 * 16];
TX_SIZE tx_size = TX_16X4;
......@@ -483,21 +484,21 @@ void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output,
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
#else
int txfm_buf[4 * 16 + 16 + 16];
DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd);
#endif // CONFIG_TXMG
}
void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
int txfm_buf[8 * 32 + 32 + 32];
DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd);
}
void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int txfm_buf[8 * 32 + 32 + 32];
DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);
int32_t rinput[8 * 32];
uint16_t routput[8 * 32];
TX_SIZE tx_size = TX_32X8;
......@@ -512,7 +513,7 @@ void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output,
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
#else
int txfm_buf[8 * 32 + 32 + 32];
DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);
inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd);
#endif // CONFIG_TXMG
}
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_dsp_rtcd.h"
#include "av1/common/av1_txfm.h"
void av1_round_shift_array_c(int32_t *arr, int size, int bit) {
int i;
if (bit == 0) {
return;
} else {
if (bit > 0) {
for (i = 0; i < size; i++) {
arr[i] = round_shift(arr[i], bit);
}
} else {
for (i = 0; i < size; i++) {
arr[i] = arr[i] * (1 << (-bit));
}
}
}
}
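A small usage sketch (not part of the commit) showing the sign convention of the
bit argument: a positive bit applies a rounded right shift, a negative bit
multiplies by 2^(-bit). The numeric results follow from the round_shift()
definition used above.

#include <stdint.h>
#include <stdio.h>

void av1_round_shift_array_c(int32_t *arr, int size, int bit);  /* from av1_txfm.c above */

int main(void) {
  int32_t a[4] = { 5, -5, 6, 7 };
  av1_round_shift_array_c(a, 4, 1);   /* rounded right shift by 1 -> { 3, -2, 3, 4 } */

  int32_t b[4] = { 1, 2, 3, 4 };
  av1_round_shift_array_c(b, 4, -2);  /* left shift by 2 -> { 4, 8, 12, 16 } */

  printf("%d %d %d %d\n", a[0], a[1], b[0], b[1]);
  return 0;
}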
......@@ -85,23 +85,6 @@ static INLINE int32_t round_shift(int32_t value, int bit) {
return (int32_t)(((int64_t)value + (1ll << (bit - 1))) >> bit);
}
static INLINE void round_shift_array(int32_t *arr, int size, int bit) {
int i;
if (bit == 0) {
return;
} else {
if (bit > 0) {
for (i = 0; i < size; i++) {
arr[i] = round_shift(arr[i], bit);
}
} else {
for (i = 0; i < size; i++) {
arr[i] = arr[i] * (1 << (-bit));
}
}
}
}
static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
int bit) {
int32_t result_32 = (int32_t)clamp64((int64_t)w0 * in0 + (int64_t)w1 * in1,
......
......@@ -64,12 +64,12 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
txfm_size);
round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
transpose_32(txfm_size, out_128, buf_128);
txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
transpose_32(txfm_size, buf_128, out_128);
}
......
......@@ -3,6 +3,7 @@
#include <smmintrin.h>
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse4.h"
#ifdef __cplusplus
extern "C" {
......@@ -81,28 +82,6 @@ static INLINE void transpose_32(int txfm_size, const __m128i *input,
}
}
static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
__m128i tmp, round;
round = _mm_set1_epi32(1 << (bit - 1));
tmp = _mm_add_epi32(vec, round);
return _mm_srai_epi32(tmp, bit);
}
static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
const int size, const int bit) {
if (bit > 0) {
int i;
for (i = 0; i < size; i++) {
output[i] = round_shift_32_sse4_1(input[i], bit);
}
} else {
int i;
for (i = 0; i < size; i++) {
output[i] = _mm_slli_epi32(input[i], -bit);
}
}
}
// out0 = in0*w0 + in1*w1
// out1 = -in1*w0 + in0*w1
#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
......@@ -113,11 +92,11 @@ static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
in0_w0 = _mm_mullo_epi32(in0, ww0); \
in1_w1 = _mm_mullo_epi32(in1, ww1); \
out0 = _mm_add_epi32(in0_w0, in1_w1); \
out0 = round_shift_32_sse4_1(out0, bit); \
out0 = av1_round_shift_32_sse4_1(out0, bit); \
in0_w1 = _mm_mullo_epi32(in0, ww1); \
in1_w0 = _mm_mullo_epi32(in1, ww0); \
out1 = _mm_sub_epi32(in0_w1, in1_w0); \
out1 = round_shift_32_sse4_1(out1, bit); \
out1 = av1_round_shift_32_sse4_1(out1, bit); \
} while (0)
// out0 = in0*w0 + in1*w1
......@@ -130,11 +109,11 @@ static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
in0_w0 = _mm_mullo_epi32(in0, ww0); \
in1_w1 = _mm_mullo_epi32(in1, ww1); \
out0 = _mm_add_epi32(in0_w0, in1_w1); \
out0 = round_shift_32_sse4_1(out0, bit); \
out0 = av1_round_shift_32_sse4_1(out0, bit); \
in0_w1 = _mm_mullo_epi32(in0, ww1); \
in1_w0 = _mm_mullo_epi32(in1, ww0); \
out1 = _mm_sub_epi32(in1_w0, in0_w1); \
out1 = round_shift_32_sse4_1(out1, bit); \
out1 = av1_round_shift_32_sse4_1(out1, bit); \
} while (0)
#ifdef __cplusplus
......
#include "./aom_dsp_rtcd.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse4.h"
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) {
__m128i *const vec = (__m128i *)arr;
const int vec_size = size >> 2;
av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit);
}
#ifndef AV1_TXFM_SSE4_H_
#define AV1_TXFM_SSE4_H_
#include <smmintrin.h>
#ifdef __cplusplus
extern "C" {
#endif
static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
__m128i tmp, round;
round = _mm_set1_epi32(1 << (bit - 1));
tmp = _mm_add_epi32(vec, round);
return _mm_srai_epi32(tmp, bit);
}
static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
__m128i *output,
const int size,
const int bit) {
if (bit > 0) {
int i;
for (i = 0; i < size; i++) {
output[i] = av1_round_shift_32_sse4_1(input[i], bit);
}
} else {
int i;
for (i = 0; i < size; i++) {
output[i] = _mm_slli_epi32(input[i], -bit);
}
}
}
#ifdef __cplusplus
}
#endif
#endif // AV1_TXFM_SSE4_H_
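An illustrative caller (not part of the commit): av1_round_shift_array_sse4_1
reinterprets the int32_t buffer as __m128i vectors, which is why the buffer must
be 16-byte aligned and why the patch switches the transform buffers to
DECLARE_ALIGNED. The helper below, and the assumption that DECLARE_ALIGNED comes
from aom_ports/mem.h, are part of this sketch, not code from the patch.

#include <stdint.h>
#include "aom_ports/mem.h"   /* DECLARE_ALIGNED */

void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);

static void round_shift_example(void) {
  /* Aligned buffer; its size is a multiple of 4 so it maps cleanly onto __m128i lanes. */
  DECLARE_ALIGNED(32, int32_t, buf[4 * 8]);
  int i;
  for (i = 0; i < 4 * 8; i++) buf[i] = i;
  av1_round_shift_array_sse4_1(buf, 4 * 8, -2);  /* multiplies every element by 4 */
}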