Commit d4327bce authored by Angie Chiang, committed by Sebastien Alaiwan
Browse files

Simplify cos_bit setting in txfm

Move cos_bit from the txfm 1D cfg to the 2D cfg.
Each txfm stage only uses one cos_bit.

This is a lossless change, and it speeds up the encoder by 2%.

Change-Id: I45d398761e4729b8c4c37729571fe3765cb0c83f
parent dc3d916b
This diff is collapsed.
......@@ -18,38 +18,38 @@
extern "C" {
#endif
void av1_idct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
void av1_idct16_new(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_idct32_new(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
#if CONFIG_TX64X64
void av1_idct64_new(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
#endif // CONFIG_TX64X64
void av1_iadst4_new(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_iadst8_new(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_iadst16_new(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_iadst32_new(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_iidentity4_c(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_iidentity8_c(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_iidentity16_c(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_iidentity32_c(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
void av1_iadst32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
#if CONFIG_TX64X64
void av1_iidentity64_c(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
void av1_iidentity64_c(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
#endif // CONFIG_TX64X64
#ifdef __cplusplus
......
This diff is collapsed.
......@@ -287,6 +287,18 @@ const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = {
#endif // CONFIG_TX64X64
};
// cos_bit values used for the column pass of the inverse 2D transform.
// av1_get_inv_txfm_cfg() reads this table as
//   inv_cos_bit_col[txw_idx][txh_idx]
// with txw_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0] and
// txh_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0].
// NOTE(review): the 0 entries presumably correspond to width/height pairs
// for which no transform size exists -- confirm against the TX_SIZE list.
const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/] = {
  { 13, 13, 13, 0, 0 },
  { 13, 13, 13, 13, 0 },
  { 13, 13, 13, 13, 13 },
  { 0, 13, 13, 13, 13 },
  { 0, 0, 13, 13, 13 },
};
// cos_bit values used for the row pass of the inverse 2D transform.
// av1_get_inv_txfm_cfg() reads this table as
//   inv_cos_bit_row[txw_idx][txh_idx]
// with txw_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0] and
// txh_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0].
// NOTE(review): the 0 entries presumably correspond to width/height pairs
// for which no transform size exists -- confirm against the TX_SIZE list.
const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/] = {
  { 13, 13, 13, 0, 0 },
  { 13, 13, 12, 12, 0 },
  { 12, 12, 12, 12, 12 },
  { 0, 12, 12, 12, 12 },
  { 0, 0, 12, 12, 12 },
};
void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
TXFM_2D_FLIP_CFG *cfg) {
assert(cfg != NULL);
......@@ -296,6 +308,10 @@ void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
cfg->col_cfg = inv_txfm_col_cfg_ls[tx_type_col][tx_size];
cfg->row_cfg = inv_txfm_row_cfg_ls[tx_type_row][tx_size];
cfg->shift = inv_txfm_shift_ls[tx_size];
const int txw_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
const int txh_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0];
cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
}
void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
......@@ -335,8 +351,8 @@ static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
assert(cfg->col_cfg->stage_num <= MAX_TXFM_STAGE_NUM);
av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd);
const int8_t *cos_bit_col = cfg->col_cfg->cos_bit;
const int8_t *cos_bit_row = cfg->row_cfg->cos_bit;
const int8_t cos_bit_col = cfg->cos_bit_col;
const int8_t cos_bit_row = cfg->cos_bit_row;
const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->col_cfg->txfm_type);
const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->row_cfg->txfm_type);
......
......@@ -104,8 +104,8 @@ static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
return round_shift(result_32, bit);
}
typedef void (*TxfmFunc)(const int32_t *input, int32_t *output,
const int8_t *cos_bit, const int8_t *stage_range);
typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit,
const int8_t *stage_range);
typedef enum TXFM_TYPE {
TXFM_TYPE_DCT4,
......@@ -129,7 +129,6 @@ typedef struct TXFM_1D_CFG {
const int stage_num;
const int8_t *stage_range;
const int8_t *cos_bit;
const TXFM_TYPE txfm_type;
} TXFM_1D_CFG;
......@@ -137,6 +136,8 @@ typedef struct TXFM_2D_FLIP_CFG {
int ud_flip; // flip upside down
int lr_flip; // flip left to right
const int8_t *shift;
int8_t cos_bit_col;
int8_t cos_bit_row;
const TXFM_1D_CFG *col_cfg;
const TXFM_1D_CFG *row_cfg;
} TXFM_2D_FLIP_CFG;
......
......@@ -88,17 +88,21 @@ static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
#if CONFIG_TX64X64 && (!CONFIG_DAALA_TX32 || !CONFIG_DAALA_TX64)
static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
int32_t in[64], out[64];
const int txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
const int txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
for (int i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
av1_idct64_new(in, out, inv_cos_bit_col_dct_64, inv_stage_range_col_dct_64);
av1_idct64_new(in, out, inv_cos_bit_col[txw_idx][txh_idx],
inv_stage_range_col_dct_64);
for (int i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
}
static void idct64_row_c(const tran_low_t *input, tran_low_t *output) {
int32_t in[64], out[64];
const int txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
const int txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
for (int i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
av1_idct64_new(in, out, inv_cos_bit_row_dct_64, inv_stage_range_row_dct_64);
av1_idct64_new(in, out, inv_cos_bit_row[txw_idx][txh_idx],
inv_stage_range_row_dct_64);
for (int i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
}
......
......@@ -601,20 +601,18 @@ static void idct32_avx2(__m256i *in, __m256i *out, int bit) {
void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m256i in[128], out[128];
const TXFM_1D_CFG *row_cfg = NULL;
const TXFM_1D_CFG *col_cfg = NULL;
const int8_t *shift = inv_txfm_shift_ls[TX_32X32];
const int txw_idx = tx_size_wide_log2[TX_32X32] - tx_size_wide_log2[0];
const int txh_idx = tx_size_high_log2[TX_32X32] - tx_size_high_log2[0];
switch (tx_type) {
case DCT_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_32;
col_cfg = &inv_txfm_1d_col_cfg_dct_32;
load_buffer_32x32(coeff, in);
transpose_32x32(in, out);
idct32_avx2(out, in, row_cfg->cos_bit[2]);
idct32_avx2(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
round_shift_32x32(in, -shift[0]);
transpose_32x32(in, out);
idct32_avx2(out, in, col_cfg->cos_bit[2]);
idct32_avx2(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_32x32(in, output, stride, 0, 0, -shift[1], bd);
break;
default: assert(0);
......
......@@ -222,81 +222,63 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[4];
const TXFM_1D_CFG *row_cfg = NULL;
const TXFM_1D_CFG *col_cfg = NULL;
const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
const int txw_idx = tx_size_wide_log2[TX_4X4] - tx_size_wide_log2[0];
const int txh_idx = tx_size_high_log2[TX_4X4] - tx_size_high_log2[0];
switch (tx_type) {
case DCT_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_4;
col_cfg = &inv_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_4;
col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
col_cfg = &inv_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_4;
col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
col_cfg = &inv_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
break;
case ADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
col_cfg = &inv_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx]);
iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
default: assert(0);
......@@ -819,108 +801,90 @@ static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[16], out[16];
const TXFM_1D_CFG *row_cfg = NULL;
const TXFM_1D_CFG *col_cfg = NULL;
const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
const int txw_idx = tx_size_wide_log2[TX_8X8] - tx_size_wide_log2[0];
const int txh_idx = tx_size_high_log2[TX_8X8] - tx_size_high_log2[0];
switch (tx_type) {
case DCT_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_8;
col_cfg = &inv_txfm_1d_col_cfg_dct_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(in, out);
round_shift_8x8(out, -shift[0]);
idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
col_cfg = &inv_txfm_1d_col_cfg_dct_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(in, out);
round_shift_8x8(out, -shift[0]);
idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_8;
col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(in, out);
round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(in, out);
round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_8;
col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(in, out);
round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
col_cfg = &inv_txfm_1d_col_cfg_dct_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(in, out);
round_shift_8x8(out, -shift[0]);
idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
case ADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(in, out);
round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(in, out);
round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
break;
case FLIPADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
col_cfg = &inv_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(coeff, in);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
transpose_8x8(in, out);
round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
break;
default: assert(0);
......@@ -1569,108 +1533,90 @@ static void round_shift_16x16(__m128i *in, int shift) {
void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[64], out[64];
const TXFM_1D_CFG *row_cfg = NULL;
const TXFM_1D_CFG *col_cfg = NULL;
const int8_t *shift = inv_txfm_shift_ls[TX_16X16];
const int txw_idx = tx_size_wide_log2[TX_16X16] - tx_size_wide_log2[0];
const int txh_idx = tx_size_high_log2[TX_16X16] - tx_size_high_log2[0];
switch (tx_type) {
case DCT_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_16;
col_cfg = &inv_txfm_1d_col_cfg_dct_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
col_cfg = &inv_txfm_1d_col_cfg_dct_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_16;
col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_16;
col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
idct16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
col_cfg = &inv_txfm_1d_col_cfg_dct_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
idct16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
break;
case ADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_16x16(in, output, stride, 1, 1, -shift[1], bd);
break;
case FLIPADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
col_cfg = &inv_txfm_1d_col_cfg_adst_16;
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
iadst16x16_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx]);
write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
break;
default: assert(0);
......
This diff is collapsed.
......@@ -18,38 +18,38 @@
extern "C" {
#endif