Commit 4a75b5a0 authored by Angie Chiang, committed by Sebastien Alaiwan

Move shift from 1d_cfg to 2d_cfg

Change-Id: I22d0fac0d8e94fc02a7adf13b144ed17097ff84b
parent a4c80485
This diff is collapsed.
......@@ -245,6 +245,48 @@ static const TXFM_1D_CFG *inv_txfm_row_cfg_ls[TX_TYPES_1D][TX_SIZES_ALL] = {
},
};
// Rounding shifts for the inverse 2D transform, one two-entry array per
// transform size (inv_shift_<size>).
//   [0]: negated and used as the rounding shift applied between the row pass
//        and the column pass (e.g. round_shift_16x16(in, -shift[0])).
//   [1]: negated and handed to the final write-out after the column pass
//        (e.g. write_buffer_4x4(..., -shift[1], bd)).
// Values are stored as non-positive numbers and negated at the point of use.
// The 64-point sizes exist only when CONFIG_TX64X64 is enabled.
static const int8_t inv_shift_4x4[2] = { 0, -4 };
static const int8_t inv_shift_8x8[2] = { 0, -5 };
static const int8_t inv_shift_16x16[2] = { -1, -5 };
static const int8_t inv_shift_32x32[2] = { -1, -5 };
#if CONFIG_TX64X64
static const int8_t inv_shift_64x64[2] = { -1, -5 };
#endif
static const int8_t inv_shift_4x8[2] = { 0, -5 };
static const int8_t inv_shift_8x4[2] = { 0, -5 };
static const int8_t inv_shift_8x16[2] = { -1, -5 };
static const int8_t inv_shift_16x8[2] = { -1, -5 };
static const int8_t inv_shift_16x32[2] = { -1, -5 };
static const int8_t inv_shift_32x16[2] = { -1, -5 };
#if CONFIG_TX64X64
static const int8_t inv_shift_32x64[2] = { -1, -5 };
static const int8_t inv_shift_64x32[2] = { -1, -5 };
#endif
static const int8_t inv_shift_4x16[2] = { -1, -5 };
static const int8_t inv_shift_16x4[2] = { -1, -5 };
static const int8_t inv_shift_8x32[2] = { -1, -5 };
static const int8_t inv_shift_32x8[2] = { -1, -5 };
#if CONFIG_TX64X64
static const int8_t inv_shift_16x64[2] = { -1, -5 };
static const int8_t inv_shift_64x16[2] = { -1, -5 };
#endif // CONFIG_TX64X64
// Lookup table from transform size to its inverse-transform shift array.
// Indexed by TX_SIZE (e.g. inv_txfm_shift_ls[TX_4X4]); av1_get_inv_txfm_cfg()
// copies the selected pointer into cfg->shift.
// The entry order presumably has to mirror the TX_SIZE enum, including the
// CONFIG_TX64X64-only entries -- TODO confirm against the enum definition.
// NOTE(review): the pointer elements are not const-qualified, so the table
// itself is writable at runtime; `const int8_t *const` would prevent that but
// requires the matching extern declaration to change as well.
const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = {
inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32,
#if CONFIG_TX64X64
inv_shift_64x64,
#endif // CONFIG_TX64X64
inv_shift_4x8, inv_shift_8x4, inv_shift_8x16, inv_shift_16x8,
inv_shift_16x32, inv_shift_32x16,
#if CONFIG_TX64X64
inv_shift_32x64, inv_shift_64x32,
#endif // CONFIG_TX64X64
inv_shift_4x16, inv_shift_16x4, inv_shift_8x32, inv_shift_32x8,
#if CONFIG_TX64X64
inv_shift_16x64, inv_shift_64x16,
#endif // CONFIG_TX64X64
};
void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
TXFM_2D_FLIP_CFG *cfg) {
assert(cfg != NULL);
......@@ -253,6 +295,7 @@ void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
const TX_TYPE_1D tx_type_row = htx_tab[tx_type];
cfg->col_cfg = inv_txfm_col_cfg_ls[tx_type_col][tx_size];
cfg->row_cfg = inv_txfm_row_cfg_ls[tx_type_row][tx_size];
cfg->shift = inv_txfm_shift_ls[tx_size];
}
void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
......@@ -271,8 +314,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
if (txfm_size_col == txfm_size_row) assert(rect_type == 0);
int rect_type_shift = 0;
// The shift is stored per 2D transform size in the 2D config.
const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift
: cfg->col_cfg->shift;
const int8_t *shift = cfg->shift;
int shift1 = shift[1];
if (rect_type == 1 || rect_type == -1) {
......@@ -312,8 +354,7 @@ static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
const int txfm_size_col = cfg->row_cfg->txfm_size;
const int txfm_size_row = cfg->col_cfg->txfm_size;
// The shift is stored per 2D transform size in the 2D config.
const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift
: cfg->col_cfg->shift;
const int8_t *shift = cfg->shift;
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
int rect_type2_shift = 0;
int rect_type1_shift = 0;
......
......@@ -115,7 +115,6 @@ typedef struct TXFM_1D_CFG {
const int txfm_size;
const int stage_num;
const int8_t *shift;
const int8_t *stage_range;
const int8_t *cos_bit;
const TXFM_TYPE txfm_type;
......@@ -124,6 +123,7 @@ typedef struct TXFM_1D_CFG {
// Configuration of one 2D transform: the flip flags, the shift array for the
// transform size, and the 1D configurations used for the column and row
// passes. Filled in by av1_get_inv_txfm_cfg() / av1_get_fwd_txfm_cfg().
typedef struct TXFM_2D_FLIP_CFG {
int ud_flip; // flip upside down
int lr_flip; // flip left to right
// Shift array for this transform size; points into inv_txfm_shift_ls or
// fwd_txfm_shift_ls depending on which getter filled the config.
const int8_t *shift;
// 1D configuration for the column transform.
const TXFM_1D_CFG *col_cfg;
// 1D configuration for the row transform.
const TXFM_1D_CFG *row_cfg;
} TXFM_2D_FLIP_CFG;
......
......@@ -603,6 +603,7 @@ void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
__m256i in[128], out[128];
const TXFM_1D_CFG *row_cfg = NULL;
const TXFM_1D_CFG *col_cfg = NULL;
const int8_t *shift = inv_txfm_shift_ls[TX_32X32];
switch (tx_type) {
case DCT_DCT:
......@@ -611,10 +612,10 @@ void av1_inv_txfm2d_add_32x32_avx2(const int32_t *coeff, uint16_t *output,
load_buffer_32x32(coeff, in);
transpose_32x32(in, out);
idct32_avx2(out, in, row_cfg->cos_bit[2]);
round_shift_32x32(in, -row_cfg->shift[0]);
round_shift_32x32(in, -shift[0]);
transpose_32x32(in, out);
idct32_avx2(out, in, col_cfg->cos_bit[2]);
write_buffer_32x32(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_32x32(in, output, stride, 0, 0, -shift[1], bd);
break;
default: assert(0);
}
......
......@@ -224,6 +224,7 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
__m128i in[4];
const TXFM_1D_CFG *row_cfg = NULL;
const TXFM_1D_CFG *col_cfg = NULL;
const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
switch (tx_type) {
case DCT_DCT:
......@@ -232,7 +233,7 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_4;
......@@ -240,7 +241,7 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
......@@ -248,7 +249,7 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
......@@ -256,7 +257,7 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
write_buffer_4x4(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_4;
......@@ -264,7 +265,7 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_4x4(coeff, in);
idct4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
......@@ -272,7 +273,7 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
idct4x4_sse4_1(in, col_cfg->cos_bit[2]);
write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
......@@ -280,7 +281,7 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
write_buffer_4x4(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
break;
case ADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
......@@ -288,7 +289,7 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
write_buffer_4x4(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_4;
......@@ -296,7 +297,7 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_4x4(coeff, in);
iadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
iadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
write_buffer_4x4(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
default: assert(0);
}
......@@ -820,6 +821,7 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
__m128i in[16], out[16];
const TXFM_1D_CFG *row_cfg = NULL;
const TXFM_1D_CFG *col_cfg = NULL;
const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
switch (tx_type) {
case DCT_DCT:
......@@ -830,7 +832,7 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
......@@ -840,7 +842,7 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_8;
......@@ -850,7 +852,7 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
......@@ -860,7 +862,7 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_8;
......@@ -870,7 +872,7 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
......@@ -880,7 +882,7 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
case ADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
......@@ -890,7 +892,7 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
......@@ -900,7 +902,7 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
break;
case FLIPADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_8;
......@@ -910,7 +912,7 @@ void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
break;
default: assert(0);
}
......@@ -1560,6 +1562,7 @@ void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
__m128i in[64], out[64];
const TXFM_1D_CFG *row_cfg = NULL;
const TXFM_1D_CFG *col_cfg = NULL;
const int8_t *shift = inv_txfm_shift_ls[TX_16X16];
switch (tx_type) {
case DCT_DCT:
......@@ -1568,10 +1571,10 @@ void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
round_shift_16x16(in, -row_cfg->shift[0]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
case DCT_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
......@@ -1579,10 +1582,10 @@ void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
round_shift_16x16(in, -row_cfg->shift[0]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_16;
......@@ -1590,10 +1593,10 @@ void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
round_shift_16x16(in, -row_cfg->shift[0]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
case ADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
......@@ -1601,10 +1604,10 @@ void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
round_shift_16x16(in, -row_cfg->shift[0]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_16x16(in, output, stride, 0, 0, -row_cfg->shift[1], bd);
write_buffer_16x16(in, output, stride, 0, 0, -shift[1], bd);
break;
case FLIPADST_DCT:
row_cfg = &inv_txfm_1d_row_cfg_dct_16;
......@@ -1612,10 +1615,10 @@ void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
round_shift_16x16(in, -row_cfg->shift[0]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
break;
case DCT_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
......@@ -1623,10 +1626,10 @@ void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
round_shift_16x16(in, -row_cfg->shift[0]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
idct16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
break;
case ADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
......@@ -1634,10 +1637,10 @@ void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
round_shift_16x16(in, -row_cfg->shift[0]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_16x16(in, output, stride, 1, 0, -row_cfg->shift[1], bd);
write_buffer_16x16(in, output, stride, 1, 0, -shift[1], bd);
break;
case FLIPADST_FLIPADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
......@@ -1645,10 +1648,10 @@ void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
round_shift_16x16(in, -row_cfg->shift[0]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_16x16(in, output, stride, 1, 1, -row_cfg->shift[1], bd);
write_buffer_16x16(in, output, stride, 1, 1, -shift[1], bd);
break;
case FLIPADST_ADST:
row_cfg = &inv_txfm_1d_row_cfg_adst_16;
......@@ -1656,10 +1659,10 @@ void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
load_buffer_16x16(coeff, in);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, row_cfg->cos_bit[2]);
round_shift_16x16(in, -row_cfg->shift[0]);
round_shift_16x16(in, -shift[0]);
transpose_16x16(in, out);
iadst16x16_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_16x16(in, output, stride, 0, 1, -row_cfg->shift[1], bd);
write_buffer_16x16(in, output, stride, 0, 1, -shift[1], bd);
break;
default: assert(0);
}
......
This diff is collapsed.
......@@ -48,7 +48,7 @@ static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
const TXFM_2D_FLIP_CFG *cfg, int bd) {
// The shift is stored per 2D transform size in the 2D config.
const int8_t *shift = cfg->row_cfg->shift;
const int8_t *shift = cfg->shift;
// i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
for (int i = 0; i < cfg->col_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) {
stage_range_col[i] = cfg->col_cfg->stage_range[i] + shift[0] + bd + 1;
......@@ -74,7 +74,7 @@ static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
const int txfm_size_col = cfg->row_cfg->txfm_size;
const int txfm_size_row = cfg->col_cfg->txfm_size;
// The shift is stored per 2D transform size in the 2D config.
const int8_t *shift = cfg->row_cfg->shift;
const int8_t *shift = cfg->shift;
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
......@@ -616,6 +616,48 @@ static const TXFM_1D_CFG *fwd_txfm_row_cfg_ls[TX_TYPES_1D][TX_SIZES_ALL] = {
},
};
// Shifts for the forward 2D transform, one three-entry array per transform
// size (fwd_shift_<size>).
//   [0]: passed to the input load step (e.g. load_buffer_4x4(..., shift[0]))
//        and added to the column stage ranges in av1_gen_fwd_stage_range().
//   [1]: negated and used for the rounding step after the column pass
//        (e.g. col_txfm_8x8_rounding(out, -shift[1])).
//   [2]: not consumed in the code visible here; presumably the shift applied
//        after the row pass -- TODO confirm.
// The 64-point sizes exist only when CONFIG_TX64X64 is enabled.
static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 };
static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 };
static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 };
static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 };
#if CONFIG_TX64X64
static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 };
#endif
static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 };
static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 };
static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 };
static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 };
static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 };
static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 };
#if CONFIG_TX64X64
static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 };
static const int8_t fwd_shift_64x32[3] = { 0, -2, -2 };
#endif
static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 };
static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 };
static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 };
static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 };
#if CONFIG_TX64X64
static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 };
static const int8_t fwd_shift_64x16[3] = { 0, -2, 0 };
#endif // CONFIG_TX64X64
// Lookup table from transform size to its forward-transform shift array.
// Indexed by TX_SIZE (e.g. fwd_txfm_shift_ls[TX_8X8]); av1_get_fwd_txfm_cfg()
// copies the selected pointer into cfg->shift.
// The entry order presumably has to mirror the TX_SIZE enum, including the
// CONFIG_TX64X64-only entries -- TODO confirm against the enum definition.
// NOTE(review): the pointer elements are not const-qualified, so the table
// itself is writable at runtime; `const int8_t *const` would prevent that but
// requires the matching extern declaration to change as well.
const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32,
#if CONFIG_TX64X64
fwd_shift_64x64,
#endif // CONFIG_TX64X64
fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16, fwd_shift_16x8,
fwd_shift_16x32, fwd_shift_32x16,
#if CONFIG_TX64X64
fwd_shift_32x64, fwd_shift_64x32,
#endif // CONFIG_TX64X64
fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32, fwd_shift_32x8,
#if CONFIG_TX64X64
fwd_shift_16x64, fwd_shift_64x16,
#endif // CONFIG_TX64X64
};
void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
TXFM_2D_FLIP_CFG *cfg) {
assert(cfg != NULL);
......@@ -624,4 +666,5 @@ void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
const TX_TYPE_1D tx_type_row = htx_tab[tx_type];
cfg->col_cfg = fwd_txfm_col_cfg_ls[tx_type_col][tx_size];
cfg->row_cfg = fwd_txfm_row_cfg_ls[tx_type_row][tx_size];
cfg->shift = fwd_txfm_shift_ls[tx_size];
}
......@@ -47,7 +47,7 @@ static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
// transforms.
assert(cfg->row_cfg->txfm_size == cfg->col_cfg->txfm_size);
const int txfm_size = cfg->row_cfg->txfm_size;
const int8_t *shift = cfg->row_cfg->shift;
const int8_t *shift = cfg->shift;
const int8_t *stage_range_col = cfg->col_cfg->stage_range;
const int8_t *stage_range_row = cfg->row_cfg->stage_range;
const int8_t *cos_bit_col = cfg->col_cfg->cos_bit;
......
......@@ -184,12 +184,13 @@ void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
__m128i in[4];
const TXFM_1D_CFG *row_cfg = NULL;
const TXFM_1D_CFG *col_cfg = NULL;
const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
switch (tx_type) {
case DCT_DCT:
row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
write_buffer_4x4(in, coeff);
......@@ -197,7 +198,7 @@ void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
case ADST_DCT:
row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
write_buffer_4x4(in, coeff);
......@@ -205,7 +206,7 @@ void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
case DCT_ADST:
row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
write_buffer_4x4(in, coeff);
......@@ -213,7 +214,7 @@ void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
case ADST_ADST:
row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(input, in, input_stride, 0, 0, row_cfg->shift[0]);
load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
write_buffer_4x4(in, coeff);
......@@ -221,7 +222,7 @@ void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
case FLIPADST_DCT:
row_cfg = &fwd_txfm_1d_row_cfg_dct_4;
col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]);
load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
fdct4x4_sse4_1(in, row_cfg->cos_bit[2]);
write_buffer_4x4(in, coeff);
......@@ -229,7 +230,7 @@ void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
case DCT_FLIPADST:
row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
col_cfg = &fwd_txfm_1d_col_cfg_dct_4;
load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]);
load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
fdct4x4_sse4_1(in, col_cfg->cos_bit[2]);
fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
write_buffer_4x4(in, coeff);
......@@ -237,7 +238,7 @@ void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
case FLIPADST_FLIPADST:
row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(input, in, input_stride, 1, 1, row_cfg->shift[0]);
load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]);
fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
write_buffer_4x4(in, coeff);
......@@ -245,7 +246,7 @@ void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
case ADST_FLIPADST:
row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(input, in, input_stride, 0, 1, row_cfg->shift[0]);
load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
write_buffer_4x4(in, coeff);
......@@ -253,7 +254,7 @@ void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
case FLIPADST_ADST:
row_cfg = &fwd_txfm_1d_row_cfg_adst_4;
col_cfg = &fwd_txfm_1d_col_cfg_adst_4;
load_buffer_4x4(input, in, input_stride, 1, 0, row_cfg->shift[0]);
load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
fadst4x4_sse4_1(in, col_cfg->cos_bit[2]);
fadst4x4_sse4_1(in, row_cfg->cos_bit[2]);
write_buffer_4x4(in, coeff);
......@@ -791,14 +792,15 @@ void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
__m128i in[16], out[16];
const TXFM_1D_CFG *row_cfg = NULL;
const TXFM_1D_CFG *col_cfg = NULL;
const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
switch (tx_type) {
case DCT_DCT:
row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
fdct8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
transpose_8x8(out, in);
......@@ -807,9 +809,9 @@ void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
case ADST_DCT:
row_cfg = &fwd_txfm_1d_row_cfg_dct_8;
col_cfg = &fwd_txfm_1d_col_cfg_adst_8;
load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
fadst8x8_sse4_1(in, out, col_cfg->cos_bit[2]);
col_txfm_8x8_rounding(out, -row_cfg->shift[1]);
col_txfm_8x8_rounding(out, -shift[1]);
transpose_8x8(out, in);
fdct8x8_sse4_1(in, out, row_cfg->cos_bit[2]);
transpose_8x8(out, in);
......@@ -818,9 +820,9 @@ void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
case DCT_ADST:
row_cfg = &fwd_txfm_1d_row_cfg_adst_8;
col_cfg = &fwd_txfm_1d_col_cfg_dct_8;
load_buffer_8x8(input, in, stride, 0, 0, row_cfg->shift[0]);
load_buffer_8x8(input, in, stride, 0, 0, shift[0]);