Commit 5500ce76 authored by Monty Montgomery, committed by Christopher Montgomery

Move all of LBD Daala TX to up-4, down-1 shift

Now that tran_low_t is assumed to be 32 bit when Daala TX is active,
there's no reason for multi-stage shifting to fit coefficients into 16
bits for the inter-transform transpose matrix. Go to a consistent up-by-
four, down-by-one shifting scheme for all TX block sizes.

(Note this is for the current AV1 coefficient scaling scheme with
av1_get_tx_scale and deeper coefficients for higher bitdepth input.
Daala TX is moving to the long-intended constant-coefficient-depth in
upcoming patches).

subset 1:
monty-4-1-baseline-s1@2017-11-11T05:57:15.857Z ->
 monty-4-1-test-s1@2017-11-11T05:57:52.983Z

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
-0.0117 | -0.0246 |  0.0530 |   0.0238 | 0.0254 |  0.0447 |    -0.0442

Change-Id: I2214e94ac822542c504d472276723277ed350abf
parent 9134586f
......@@ -1008,9 +1008,9 @@ void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
for (int i = 0; i < n2; ++i) {
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
tran_low_t temp_in[16];
for (int j = 0; j < n; j++) temp_in[j] = input[j] * 2;
for (int j = 0; j < n; j++) temp_in[j] = input[j] * 4;
IHT_16x32[tx_type].rows(temp_in, outtmp);
for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j] * 4;
for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
#else
IHT_16x32[tx_type].rows(input, outtmp);
for (int j = 0; j < n; ++j)
......@@ -1030,7 +1030,7 @@ void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int d = i * stride + j;
int s = j * outstride + i;
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
#endif
......@@ -1095,9 +1095,9 @@ void av1_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
for (int i = 0; i < n; ++i) {
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
tran_low_t temp_in[32];
for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 2;
for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 4;
IHT_32x16[tx_type].rows(temp_in, outtmp);
for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j] * 4;
for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
#else
IHT_32x16[tx_type].rows(input, outtmp);
for (int j = 0; j < n2; ++j)
......@@ -1117,7 +1117,7 @@ void av1_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int d = i * stride + j;
int s = j * outstride + i;
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
#endif
......@@ -1360,7 +1360,7 @@ void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
for (int i = 0; i < 32; ++i) {
#if CONFIG_DAALA_TX32
tran_low_t temp_in[32];
for (int j = 0; j < 32; j++) temp_in[j] = input[j] * 2;
for (int j = 0; j < 32; j++) temp_in[j] = input[j] * 4;
IHT_32[tx_type].rows(temp_in, out[i]);
#else
IHT_32[tx_type].rows(input, out[i]);
......@@ -1369,15 +1369,8 @@ void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
}
// transpose
for (int i = 0; i < 32; i++) {
for (int j = 0; j < 32; j++) {
#if CONFIG_DAALA_TX32
tmp[j][i] = out[i][j] * 4;
#else
tmp[j][i] = out[i][j];
#endif
}
}
for (int i = 0; i < 32; i++)
for (int j = 0; j < 32; j++) tmp[j][i] = out[i][j];
// inverse transform column vectors
for (int i = 0; i < 32; ++i) IHT_32[tx_type].cols(tmp[i], out[i]);
......@@ -1390,7 +1383,7 @@ void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int d = i * stride + j;
int s = j * outstride + i;
#if CONFIG_DAALA_TX32
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
#endif
......@@ -1455,9 +1448,8 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
for (int i = 0; i < 64; ++i) {
#if CONFIG_DAALA_TX64
tran_low_t temp_in[64];
for (int j = 0; j < 64; j++) temp_in[j] = input[j] * 2;
for (int j = 0; j < 64; j++) temp_in[j] = input[j] * 8;
IHT_64[tx_type].rows(temp_in, out[i]);
// Do not rescale intermediate for Daala
#else
IHT_64[tx_type].rows(input, out[i]);
for (int j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
......@@ -1483,7 +1475,7 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int d = i * stride + j;
int s = j * outstride + i;
#if CONFIG_DAALA_TX64
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 2));
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
#endif
......
......@@ -1965,7 +1965,7 @@ void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
ht.rows(temp_in, temp_out);
for (j = 0; j < n; ++j) {
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
out[j * n2 + i] = temp_out[j];
#else
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
#endif
......@@ -1976,7 +1976,12 @@ void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.cols(temp_in, temp_out);
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
for (j = 0; j < n2; ++j)
output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
#else
for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
#endif
}
// Note: overall scale factor of transform is 4 times unitary
}
......@@ -2049,7 +2054,7 @@ void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
ht.cols(temp_in, temp_out);
for (j = 0; j < n; ++j) {
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
out[j * n2 + i] = temp_out[j];
#else
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
#endif
......@@ -2060,7 +2065,12 @@ void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.rows(temp_in, temp_out);
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
for (j = 0; j < n2; ++j)
output[j + i * n2] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
#else
for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
#endif
}
// Note: overall scale factor of transform is 4 times unitary
}
......@@ -2489,8 +2499,7 @@ void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
#if CONFIG_DAALA_TX64
for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i] * 16;
ht.cols(temp_in, temp_out);
for (j = 0; j < 64; ++j)
out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 3;
for (j = 0; j < 64; ++j) out[j * 64 + i] = temp_out[j];
#else
for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
......@@ -2506,7 +2515,7 @@ void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
ht.rows(temp_in, temp_out);
for (j = 0; j < 64; ++j)
#if CONFIG_DAALA_TX64
output[j + i * 64] = temp_out[j];
output[j + i * 64] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
#else
output[j + i * 64] =
(tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment