diff --git a/vp10/common/od_dering.c b/vp10/common/od_dering.c
index 66eecea60e733d0e1082f99598595b989ae24cdf..af89b80dbac1c76a0d06ac0c6907c53aa0a5864d 100644
--- a/vp10/common/od_dering.c
+++ b/vp10/common/od_dering.c
@@ -61,15 +61,21 @@ const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS] = {
 static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
     int coeff_shift) {
   int i;
-  int cost[8] = {0};
+  int32_t cost[8] = {0};
   int partial[8][15] = {{0}};
-  int best_cost = 0;
+  int32_t best_cost = 0;
   int best_dir = 0;
+  /* Instead of dividing by n (1 <= n <= 8), we multiply by 840/n, where
+     840 = 3*5*7*8 = lcm(1..8). Every cost is then 840 times larger, which
+     does not change which direction has the maximum cost. */
+  static const int div_table[] = {0, 840, 420, 280, 210, 168, 140, 120, 105};
   for (i = 0; i < 8; i++) {
     int j;
     for (j = 0; j < 8; j++) {
       int x;
-      x = img[i*stride + j] >> coeff_shift;
+      /* We subtract 128 here to reduce the maximum range of the squared
+         partial sums. */
+      x = (img[i*stride + j] >> coeff_shift) - 128;
       partial[0][i + j] += x;
       partial[1][i + j/2] += x;
       partial[2][i] += x;
@@ -81,25 +87,28 @@ static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
     }
   }
   for (i = 0; i < 8; i++) {
-    cost[2] += partial[2][i]*partial[2][i] >> 3;
-    cost[6] += partial[6][i]*partial[6][i] >> 3;
+    cost[2] += partial[2][i]*partial[2][i];
+    cost[6] += partial[6][i]*partial[6][i];
   }
+  cost[2] *= div_table[8];
+  cost[6] *= div_table[8];
   for (i = 0; i < 7; i++) {
-    cost[0] += OD_DIVU_SMALL(partial[0][i]*partial[0][i], i + 1)
-     + OD_DIVU_SMALL(partial[0][14 - i]*partial[0][14 - i], i + 1);
-    cost[4] += OD_DIVU_SMALL(partial[4][i]*partial[4][i], i + 1)
-     + OD_DIVU_SMALL(partial[4][14 - i]*partial[4][14 - i], i + 1);
+    cost[0] += (partial[0][i]*partial[0][i]
+     + partial[0][14 - i]*partial[0][14 - i])*div_table[i + 1];
+    cost[4] += (partial[4][i]*partial[4][i]
+     + partial[4][14 - i]*partial[4][14 - i])*div_table[i + 1];
   }
-  cost[0] += partial[0][7]*partial[0][8 - 1] >> 3;
-  cost[4] += partial[4][7]*partial[4][8 - 1] >> 3;
+  cost[0] += partial[0][7]*partial[0][7]*div_table[8];
+  cost[4] += partial[4][7]*partial[4][7]*div_table[8];
   for (i = 1; i < 8; i += 2) {
     int j;
     for (j = 0; j < 4 + 1; j++) {
-      cost[i] += partial[i][3 + j]*partial[i][3 + j] >> 3;
+      cost[i] += partial[i][3 + j]*partial[i][3 + j];
     }
+    cost[i] *= div_table[8];
     for (j = 0; j < 4 - 1; j++) {
-      cost[i] += OD_DIVU_SMALL(partial[i][j]*partial[i][j], 2*j + 2)
-       + OD_DIVU_SMALL(partial[i][10 - j]*partial[i][10 - j], 2*j + 2);
+      cost[i] += (partial[i][j]*partial[i][j]
+       + partial[i][10 - j]*partial[i][10 - j])*div_table[2*j + 2];
     }
   }
   for (i = 0; i < 8; i++) {
@@ -111,6 +120,9 @@ static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
   /* Difference between the optimal variance and the variance along the
      orthogonal direction. Again, the sum(x^2) terms cancel out. */
   *var = best_cost - cost[(best_dir + 4) & 7];
+  /* We'd normally divide by 840, but dividing by 1024 is close enough
+     for what we're going to do with this. */
+  *var >>= 10;
   return best_dir;
 }