Commit 1ac47a7c authored by Urvang Joshi's avatar Urvang Joshi

round_shift_array: Use SSE4 version everywhere.

Usage of CPU by round_shift_array goes from 2.01% to 1.04%.
Overall encoding is slightly faster (~0.05%).

This means some of the intermediate arrays have to be aligned.
Also, these functions were moved to common header/source files.

BUG=aomedia:1106

Change-Id: I492c9b1f2e7339c6cb83cfe68a61218642654d1b
parent 3ae12355
......@@ -536,6 +536,10 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
} # CONFIG_HIGHBITDEPTH
# Helper functions.
add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
specialize "av1_round_shift_array", qw/sse4_1/;
#
# Encoder functions.
#
......
......@@ -26,6 +26,7 @@ set(AOM_AV1_COMMON_SOURCES
"${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h"
"${AOM_ROOT}/av1/common/av1_loopfilter.c"
"${AOM_ROOT}/av1/common/av1_loopfilter.h"
"${AOM_ROOT}/av1/common/av1_txfm.c"
"${AOM_ROOT}/av1/common/av1_txfm.h"
"${AOM_ROOT}/av1/common/blockd.c"
"${AOM_ROOT}/av1/common/blockd.h"
......@@ -176,6 +177,8 @@ set(AOM_AV1_COMMON_INTRIN_SSSE3
"${AOM_ROOT}/av1/common/x86/av1_convolve_ssse3.c")
set(AOM_AV1_COMMON_INTRIN_SSE4_1
"${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c"
"${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h"
"${AOM_ROOT}/av1/common/x86/av1_fwd_txfm1d_sse4.c"
"${AOM_ROOT}/av1/common/x86/av1_fwd_txfm2d_sse4.c"
"${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c")
......
......@@ -69,6 +69,9 @@ AV1_COMMON_SRCS-yes += common/scan.c
AV1_COMMON_SRCS-yes += common/scan.h
# TODO(angiebird) the forward transform belongs under encoder/
AV1_COMMON_SRCS-yes += common/av1_txfm.h
AV1_COMMON_SRCS-yes += common/av1_txfm.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm_sse4.h
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm_sse4.c
AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.h
AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.c
AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d.h
......
......@@ -11,6 +11,7 @@
#include <assert.h>
#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom_dsp/txfm_common.h"
#include "av1/common/enums.h"
......@@ -115,7 +116,7 @@ static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
// flip upside down
temp_in[r] = input[(txfm_size_row - r - 1) * stride + c];
}
round_shift_array(temp_in, txfm_size_row, -shift[0]);
av1_round_shift_array(temp_in, txfm_size_row, -shift[0]);
// Multiply everything by Sqrt2 on the larger dimension if the
// transform is rectangular and the size difference is a factor of 2.
// If the size difference is a factor of 4, multiply by
......@@ -124,10 +125,10 @@ static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
for (r = 0; r < txfm_size_row; ++r)
temp_in[r] = (int32_t)fdct_round_shift(temp_in[r] * Sqrt2);
} else if (rect_type == 2) {
round_shift_array(temp_in, txfm_size_row, -rect_type2_shift);
av1_round_shift_array(temp_in, txfm_size_row, -rect_type2_shift);
}
txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
round_shift_array(temp_out, txfm_size_row, -shift[1]);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
if (cfg->lr_flip == 0) {
for (r = 0; r < txfm_size_row; ++r)
buf[r * txfm_size_col + c] = temp_out[r];
......@@ -154,14 +155,14 @@ static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
}
txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col,
cos_bit_row, stage_range_row);
round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]);
av1_round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]);
}
}
void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[4 * 8];
DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]);
int16_t rinput[4 * 8];
TX_SIZE tx_size = TX_4X8;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -194,7 +195,7 @@ void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[8 * 16];
DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]);
int16_t rinput[8 * 16];
TX_SIZE tx_size = TX_8X16;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -227,7 +228,7 @@ void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[16 * 32];
DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]);
int16_t rinput[16 * 32];
TX_SIZE tx_size = TX_16X32;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -260,7 +261,7 @@ void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[4 * 16];
DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]);
int16_t rinput[4 * 16];
TX_SIZE tx_size = TX_4X16;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -293,7 +294,7 @@ void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[32 * 8];
DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]);
int16_t rinput[32 * 8];
TX_SIZE tx_size = TX_8X32;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -378,7 +379,7 @@ void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[32 * 64];
DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]);
int16_t rinput[64 * 32];
TX_SIZE tx_size = TX_32X64;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......@@ -424,7 +425,7 @@ void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
#if CONFIG_TXMG
int32_t txfm_buf[64 * 16];
DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]);
int16_t rinput[64 * 16];
TX_SIZE tx_size = TX_16X64;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
......
This diff is collapsed.
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_dsp_rtcd.h"
#include "av1/common/av1_txfm.h"
// Rounds and shifts each of the first `size` entries of `arr` in place.
// For bit > 0 each entry becomes (arr[i] + (1 << (bit - 1))) >> bit (the
// scalar round_shift() operation, inlined here); for bit < 0 each entry is
// scaled up by 2^(-bit); bit == 0 leaves the array untouched.
void av1_round_shift_array_c(int32_t *arr, int size, int bit) {
  int i;
  if (bit == 0) return;
  if (bit > 0) {
    const int64_t rounding = 1ll << (bit - 1);
    for (i = 0; i < size; i++) {
      arr[i] = (int32_t)(((int64_t)arr[i] + rounding) >> bit);
    }
  } else {
    const int32_t scale = 1 << (-bit);
    for (i = 0; i < size; i++) {
      arr[i] *= scale;
    }
  }
}
......@@ -85,23 +85,6 @@ static INLINE int32_t round_shift(int32_t value, int bit) {
return (int32_t)(((int64_t)value + (1ll << (bit - 1))) >> bit);
}
// In-place rounding bit-shift of the first `size` elements of `arr`.
// bit > 0: each element becomes round_shift(arr[i], bit), i.e. an arithmetic
//          right-shift after adding 1 << (bit - 1).
// bit < 0: each element is scaled up by 2^(-bit).
// bit == 0: no-op.
static INLINE void round_shift_array(int32_t *arr, int size, int bit) {
int i;
if (bit == 0) {
return;
} else {
if (bit > 0) {
for (i = 0; i < size; i++) {
arr[i] = round_shift(arr[i], bit);
}
} else {
for (i = 0; i < size; i++) {
// NOTE(review): assumes arr[i] << (-bit) fits in int32_t; signed
// overflow here would be undefined behavior -- confirm value ranges.
arr[i] = arr[i] * (1 << (-bit));
}
}
}
}
static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
int bit) {
int32_t result_32 = (int32_t)clamp64((int64_t)w0 * in0 + (int64_t)w1 * in1,
......
......@@ -64,12 +64,12 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
txfm_size);
round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
transpose_32(txfm_size, out_128, buf_128);
txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
transpose_32(txfm_size, buf_128, out_128);
}
......
......@@ -3,6 +3,7 @@
#include <smmintrin.h>
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse4.h"
#ifdef __cplusplus
extern "C" {
......@@ -81,28 +82,6 @@ static INLINE void transpose_32(int txfm_size, const __m128i *input,
}
}
// Rounding right-shift of each 32-bit lane: (lane + (1 << (bit - 1))) >> bit,
// using an arithmetic shift. Requires bit > 0 (bit == 0 would form 1 << -1).
static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
__m128i tmp, round;
round = _mm_set1_epi32(1 << (bit - 1));
tmp = _mm_add_epi32(vec, round);
return _mm_srai_epi32(tmp, bit);
}
// Applies the per-lane rounding right-shift to each of the `size` vectors
// when bit > 0; otherwise left-shifts every lane by -bit (bit == 0 copies).
// Safe when input == output, since element i is read before it is written.
static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
const int size, const int bit) {
if (bit > 0) {
int i;
for (i = 0; i < size; i++) {
output[i] = round_shift_32_sse4_1(input[i], bit);
}
} else {
int i;
for (i = 0; i < size; i++) {
output[i] = _mm_slli_epi32(input[i], -bit);
}
}
}
// out0 = in0*w0 + in1*w1
// out1 = -in1*w0 + in0*w1
#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
......@@ -113,11 +92,11 @@ static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
in0_w0 = _mm_mullo_epi32(in0, ww0); \
in1_w1 = _mm_mullo_epi32(in1, ww1); \
out0 = _mm_add_epi32(in0_w0, in1_w1); \
out0 = round_shift_32_sse4_1(out0, bit); \
out0 = av1_round_shift_32_sse4_1(out0, bit); \
in0_w1 = _mm_mullo_epi32(in0, ww1); \
in1_w0 = _mm_mullo_epi32(in1, ww0); \
out1 = _mm_sub_epi32(in0_w1, in1_w0); \
out1 = round_shift_32_sse4_1(out1, bit); \
out1 = av1_round_shift_32_sse4_1(out1, bit); \
} while (0)
// out0 = in0*w0 + in1*w1
......@@ -130,11 +109,11 @@ static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
in0_w0 = _mm_mullo_epi32(in0, ww0); \
in1_w1 = _mm_mullo_epi32(in1, ww1); \
out0 = _mm_add_epi32(in0_w0, in1_w1); \
out0 = round_shift_32_sse4_1(out0, bit); \
out0 = av1_round_shift_32_sse4_1(out0, bit); \
in0_w1 = _mm_mullo_epi32(in0, ww1); \
in1_w0 = _mm_mullo_epi32(in1, ww0); \
out1 = _mm_sub_epi32(in1_w0, in0_w1); \
out1 = round_shift_32_sse4_1(out1, bit); \
out1 = av1_round_shift_32_sse4_1(out1, bit); \
} while (0)
#ifdef __cplusplus
......
#include "./aom_dsp_rtcd.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse4.h"
// SSE4.1 implementation of av1_round_shift_array(): processes the array four
// int32_t lanes at a time via av1_round_shift_array_32_sse4_1 (in place).
// Callers must pass a buffer suitably aligned for __m128i access (the commit
// aligns the intermediate arrays with DECLARE_ALIGNED for this reason), and
// `size` is expected to be a multiple of 4 -- any remainder (size & 3)
// elements are silently skipped by the size >> 2 truncation.
void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) {
__m128i *const vec = (__m128i *)arr;
const int vec_size = size >> 2;
av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit);
}
#ifndef AV1_TXFM_SSE4_H_
#define AV1_TXFM_SSE4_H_
#include <smmintrin.h>
#ifdef __cplusplus
extern "C" {
#endif
// Per-lane rounding right-shift: returns (lane + (1 << (bit - 1))) >> bit for
// each of the four int32 lanes, using an arithmetic shift. Requires bit > 0.
static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  return _mm_srai_epi32(_mm_add_epi32(vec, rounding), bit);
}
// Vector counterpart of av1_round_shift_array(): for bit > 0 applies the
// per-lane rounding shift to each of the `size` vectors; otherwise left-shifts
// every lane by -bit (bit == 0 copies). Safe when input == output, since
// element i is read before it is written.
static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
                                                   __m128i *output,
                                                   const int size,
                                                   const int bit) {
  int i;
  if (bit > 0) {
    for (i = 0; i < size; i++) {
      output[i] = av1_round_shift_32_sse4_1(input[i], bit);
    }
  } else {
    for (i = 0; i < size; i++) {
      output[i] = _mm_slli_epi32(input[i], -bit);
    }
  }
}
#ifdef __cplusplus
}
#endif
#endif // AV1_TXFM_SSE4_H_
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment