Commit 8e1d0f70 authored by Angie Chiang

Change scales of fht 32x16 16x32 32x32 functions

Performance drop with ext_tx and rect_tx on
       BDRate
lowres -0.028
midres -0.075
hdres  -0.054

Change-Id: I50f89b9e9785d82ab05c3276a3c8b22b4dcfd408
parent 705ce47f
...@@ -1654,15 +1654,14 @@ void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride, ...@@ -1654,15 +1654,14 @@ void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
(tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2); (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
ht.rows(temp_in, temp_out); ht.rows(temp_in, temp_out);
for (j = 0; j < n; ++j) for (j = 0; j < n; ++j)
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
} }
// Columns // Columns
for (i = 0; i < n; ++i) { for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.cols(temp_in, temp_out); ht.cols(temp_in, temp_out);
for (j = 0; j < n2; ++j) for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
} }
// Note: overall scale factor of transform is 4 times unitary // Note: overall scale factor of transform is 4 times unitary
} }
...@@ -1707,15 +1706,14 @@ void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride, ...@@ -1707,15 +1706,14 @@ void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
(tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
ht.cols(temp_in, temp_out); ht.cols(temp_in, temp_out);
for (j = 0; j < n; ++j) for (j = 0; j < n; ++j)
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
} }
// Rows // Rows
for (i = 0; i < n; ++i) { for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.rows(temp_in, temp_out); ht.rows(temp_in, temp_out);
for (j = 0; j < n2; ++j) for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
output[j + i * n2] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
} }
// Note: overall scale factor of transform is 4 times unitary // Note: overall scale factor of transform is 4 times unitary
} }
...@@ -2074,17 +2072,6 @@ void av1_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, ...@@ -2074,17 +2072,6 @@ void av1_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
} }
#endif // CONFIG_AOM_HIGHBITDEPTH #endif // CONFIG_AOM_HIGHBITDEPTH
// TODO(luoyi): Adding this function to avoid DCT_DCT overflow.
// Remove this function after we scale the column txfm output correctly.
//
// Scans the first `size` entries of `input` and reports whether any of them
// has a magnitude strictly greater than `bound`.
//
// input: samples to inspect; entries input[0] .. input[size - 1] are read.
// bound: largest magnitude that is still considered in range.
// size:  number of entries to inspect; a value <= 0 inspects nothing.
//
// Returns 1 as soon as an out-of-range entry is found, 0 otherwise.
//
// NOTE(review): entries are read contiguously; no stride is applied here.
// Confirm that callers with a non-contiguous block layout intend this.
static INLINE int range_check_dct32x32(const int16_t *input, int16_t bound,
int size) {
int i;
for (i = 0; i < size; ++i) {
// input[i] is promoted to int before abs(), so -32768 does not overflow.
if (abs(input[i]) > bound) return 1;
}
return 0;
}
void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride, void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) { int tx_type) {
static const transform_2d FHT[] = { static const transform_2d FHT[] = {
...@@ -2117,27 +2104,19 @@ void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride, ...@@ -2117,27 +2104,19 @@ void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type); maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
#endif #endif
if (DCT_DCT == tx_type) {
if (range_check_dct32x32(input, (1 << 6) - 1, 1 << 10)) {
aom_fdct32x32_c(input, output, stride);
return;
}
}
// Columns // Columns
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out); ht.cols(temp_in, temp_out);
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j)
out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
} }
// Rows // Rows
for (i = 0; i < 32; ++i) { for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
ht.rows(temp_in, temp_out); ht.rows(temp_in, temp_out);
for (j = 0; j < 32; ++j) for (j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j];
output[j + i * 32] =
(tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
} }
} }
......
...@@ -92,14 +92,14 @@ static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, ...@@ -92,14 +92,14 @@ static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type, int diff_stride, TX_TYPE tx_type,
FWD_TXFM_OPT fwd_txfm_opt) { FWD_TXFM_OPT fwd_txfm_opt) {
(void)fwd_txfm_opt; (void)fwd_txfm_opt;
av1_fht16x32(src_diff, coeff, diff_stride, tx_type); av1_fht16x32_c(src_diff, coeff, diff_stride, tx_type);
} }
static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type, int diff_stride, TX_TYPE tx_type,
FWD_TXFM_OPT fwd_txfm_opt) { FWD_TXFM_OPT fwd_txfm_opt) {
(void)fwd_txfm_opt; (void)fwd_txfm_opt;
av1_fht32x16(src_diff, coeff, diff_stride, tx_type); av1_fht32x16_c(src_diff, coeff, diff_stride, tx_type);
} }
static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
...@@ -135,7 +135,7 @@ static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, ...@@ -135,7 +135,7 @@ static void fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type); av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
else else
#endif #endif
av1_fht32x32(src_diff, coeff, diff_stride, tx_type); av1_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
} }
#if CONFIG_TX64X64 #if CONFIG_TX64X64
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment