Commit 9c48eec7 authored by Jean-Marc Valin's avatar Jean-Marc Valin

Removing divisions from od_dir_find8()

Instead of dividing the squared partial sums by the number n of pixels in the
line, we multiply by 840/n, where 840=3*5*7*8. This not only avoids the
divisions, but it also makes the optimization exact as there is no more
rounding.

ntt-short1 resuts:
         MEDIUM (%) HIGH (%)
    PSNR -0.012070 -0.059644
 PSNRHVS -0.016845 -0.020871
    SSIM -0.026984 -0.031257
FASTSSIM -0.026078 0.414901

Change-Id: Ie553d5e3a545dee860a00879d724ecfc00f0a974
parent 0950ed82
......@@ -61,15 +61,21 @@ const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS] = {
static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
int coeff_shift) {
int i;
int cost[8] = {0};
int32_t cost[8] = {0};
int partial[8][15] = {{0}};
int best_cost = 0;
int32_t best_cost = 0;
int best_dir = 0;
/* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
The output is then 840 times larger, but we don't care for finding
the max. */
static const int div_table[] = {0, 840, 420, 280, 210, 168, 140, 120, 105};
for (i = 0; i < 8; i++) {
int j;
for (j = 0; j < 8; j++) {
int x;
x = img[i*stride + j] >> coeff_shift;
/* We subtract 128 here to reduce the maximum range of the squared
partial sums. */
x = (img[i*stride + j] >> coeff_shift) - 128;
partial[0][i + j] += x;
partial[1][i + j/2] += x;
partial[2][i] += x;
......@@ -81,25 +87,28 @@ static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
}
}
for (i = 0; i < 8; i++) {
cost[2] += partial[2][i]*partial[2][i] >> 3;
cost[6] += partial[6][i]*partial[6][i] >> 3;
cost[2] += partial[2][i]*partial[2][i];
cost[6] += partial[6][i]*partial[6][i];
}
cost[2] *= div_table[8];
cost[6] *= div_table[8];
for (i = 0; i < 7; i++) {
cost[0] += OD_DIVU_SMALL(partial[0][i]*partial[0][i], i + 1)
+ OD_DIVU_SMALL(partial[0][14 - i]*partial[0][14 - i], i + 1);
cost[4] += OD_DIVU_SMALL(partial[4][i]*partial[4][i], i + 1)
+ OD_DIVU_SMALL(partial[4][14 - i]*partial[4][14 - i], i + 1);
cost[0] += (partial[0][i]*partial[0][i]
+ partial[0][14 - i]*partial[0][14 - i])*div_table[i + 1];
cost[4] += (partial[4][i]*partial[4][i]
+ partial[4][14 - i]*partial[4][14 - i])*div_table[i + 1];
}
cost[0] += partial[0][7]*partial[0][8 - 1] >> 3;
cost[4] += partial[4][7]*partial[4][8 - 1] >> 3;
cost[0] += partial[0][7]*partial[0][7]*div_table[8];
cost[4] += partial[4][7]*partial[4][7]*div_table[8];
for (i = 1; i < 8; i += 2) {
int j;
for (j = 0; j < 4 + 1; j++) {
cost[i] += partial[i][3 + j]*partial[i][3 + j] >> 3;
cost[i] += partial[i][3 + j]*partial[i][3 + j];
}
cost[i] *= div_table[8];
for (j = 0; j < 4 - 1; j++) {
cost[i] += OD_DIVU_SMALL(partial[i][j]*partial[i][j], 2*j + 2)
+ OD_DIVU_SMALL(partial[i][10 - j]*partial[i][10 - j], 2*j + 2);
cost[i] += (partial[i][j]*partial[i][j]
+ partial[i][10 - j]*partial[i][10 - j])*div_table[2*j + 2];
}
}
for (i = 0; i < 8; i++) {
......@@ -111,6 +120,9 @@ static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
/* Difference between the optimal variance and the variance along the
orthogonal direction. Again, the sum(x^2) terms cancel out. */
*var = best_cost - cost[(best_dir + 4) & 7];
/* We'd normally divide by 840, but dividing by 1024 is close enough
for what we're going to do with this. */
*var >>= 10;
return best_dir;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment