From 15338d5f40b3361ce82840573f65dc42ce899fba Mon Sep 17 00:00:00 2001
From: David Barker <david.barker@argondesign.com>
Date: Mon, 13 Feb 2017 15:30:59 +0000
Subject: [PATCH] Speed up global motion determination

When global-motion is enabled, a considerable amount
of encoder time is spent in the functions in corner_match.c.
This patch optimizes those functions to be 3.5-4x as fast,
leading to an end-to-end encoder speed improvement
(on 20 frames of tempete_cif.y4m) of:

 200kbps: ~26% faster
 800kbps: ~19% faster
2800kbps: ~12% faster

Change-Id: I04d3f87484c36c41eb5a1e86e814f2accbe86297
---
 av1/encoder/corner_match.c  | 140 ++++++++++++++++--------------------
 av1/encoder/corner_match.h  |   6 +-
 av1/encoder/global_motion.c |   6 +-
 av1/encoder/ransac.c        |  16 ++---
 av1/encoder/ransac.h        |  16 ++---
 5 files changed, 81 insertions(+), 103 deletions(-)

diff --git a/av1/encoder/corner_match.c b/av1/encoder/corner_match.c
index 63225a8b32..64ee0c5ae1 100644
--- a/av1/encoder/corner_match.c
+++ b/av1/encoder/corner_match.c
@@ -24,11 +24,13 @@
 
 #define THRESHOLD_NCC 0.75
 
-static double compute_variance(unsigned char *im, int stride, int x, int y,
-                               double *mean) {
-  double sum = 0.0;
-  double sumsq = 0.0;
-  double var;
+/* Compute var(im) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of im,
+   centered at (x, y).
+*/
+static double compute_variance(unsigned char *im, int stride, int x, int y) {
+  int sum = 0.0;
+  int sumsq = 0.0;
+  int var;
   int i, j;
   for (i = 0; i < MATCH_SZ; ++i)
     for (j = 0; j < MATCH_SZ; ++j) {
@@ -36,39 +38,45 @@ static double compute_variance(unsigned char *im, int stride, int x, int y,
       sumsq += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] *
                im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)];
     }
-  var = (sumsq * MATCH_SZ_SQ - sum * sum) / (MATCH_SZ_SQ * MATCH_SZ_SQ);
-  if (mean) *mean = sum / MATCH_SZ_SQ;
-  return var;
+  var = sumsq * MATCH_SZ_SQ - sum * sum;
+  return (double)var;
 }
 
+/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the
+   correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+   of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
 static double compute_cross_correlation(unsigned char *im1, int stride1, int x1,
                                         int y1, unsigned char *im2, int stride2,
                                         int x2, int y2) {
-  double sum1 = 0;
-  double sum2 = 0;
-  double cross = 0;
-  double corr;
+  int v1, v2;
+  int sum1 = 0;
+  int sum2 = 0;
+  int sumsq2 = 0;
+  int cross = 0;
+  int var2, cov;
   int i, j;
   for (i = 0; i < MATCH_SZ; ++i)
     for (j = 0; j < MATCH_SZ; ++j) {
-      sum1 += im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)];
-      sum2 += im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)];
-      cross +=
-          im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)] *
-          im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)];
+      v1 = im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)];
+      v2 = im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)];
+      sum1 += v1;
+      sum2 += v2;
+      sumsq2 += v2 * v2;
+      cross += v1 * v2;
     }
-  corr = (cross * MATCH_SZ_SQ - sum1 * sum2) / (MATCH_SZ_SQ * MATCH_SZ_SQ);
-  return corr;
+  var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
+  cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+  return cov / sqrt((double)var2);
 }
 
-static int is_eligible_point(double pointx, double pointy, int width,
-                             int height) {
+static int is_eligible_point(int pointx, int pointy, int width, int height) {
   return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 &&
           pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height);
 }
 
-static int is_eligible_distance(double point1x, double point1y, double point2x,
-                                double point2y, int width, int height) {
+static int is_eligible_distance(int point1x, int point1y, int point2x,
+                                int point2y, int width, int height) {
   const int thresh = (width < height ? height : width) >> 4;
   return ((point1x - point2x) * (point1x - point2x) +
           (point1y - point2y) * (point1y - point2y)) <= thresh * thresh;
@@ -81,32 +89,21 @@ static void improve_correspondence(unsigned char *frm, unsigned char *ref,
                                    int num_correspondences) {
   int i;
   for (i = 0; i < num_correspondences; ++i) {
-    double template_norm =
-        compute_variance(frm, frm_stride, (int)correspondences[i].x,
-                         (int)correspondences[i].y, NULL);
     int x, y, best_x = 0, best_y = 0;
     double best_match_ncc = 0.0;
     for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) {
       for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
         double match_ncc;
-        double subimage_norm;
-        if (!is_eligible_point((int)correspondences[i].rx + x,
-                               (int)correspondences[i].ry + y, width, height))
+        if (!is_eligible_point(correspondences[i].rx + x,
+                               correspondences[i].ry + y, width, height))
           continue;
-        if (!is_eligible_distance(
-                (int)correspondences[i].x, (int)correspondences[i].y,
-                (int)correspondences[i].rx + x, (int)correspondences[i].ry + y,
-                width, height))
+        if (!is_eligible_distance(correspondences[i].x, correspondences[i].y,
+                                  correspondences[i].rx + x,
+                                  correspondences[i].ry + y, width, height))
           continue;
-        subimage_norm =
-            compute_variance(ref, ref_stride, (int)correspondences[i].rx + x,
-                             (int)correspondences[i].ry + y, NULL);
         match_ncc = compute_cross_correlation(
-                        frm, frm_stride, (int)correspondences[i].x,
-                        (int)correspondences[i].y, ref, ref_stride,
-                        (int)correspondences[i].rx + x,
-                        (int)correspondences[i].ry + y) /
-                    sqrt(template_norm * subimage_norm);
+            frm, frm_stride, correspondences[i].x, correspondences[i].y, ref,
+            ref_stride, correspondences[i].rx + x, correspondences[i].ry + y);
         if (match_ncc > best_match_ncc) {
           best_match_ncc = match_ncc;
           best_y = y;
@@ -114,36 +111,25 @@ static void improve_correspondence(unsigned char *frm, unsigned char *ref,
         }
       }
     }
-    correspondences[i].rx += (double)best_x;
-    correspondences[i].ry += (double)best_y;
+    correspondences[i].rx += best_x;
+    correspondences[i].ry += best_y;
   }
   for (i = 0; i < num_correspondences; ++i) {
-    double template_norm =
-        compute_variance(ref, ref_stride, (int)correspondences[i].rx,
-                         (int)correspondences[i].ry, NULL);
     int x, y, best_x = 0, best_y = 0;
     double best_match_ncc = 0.0;
     for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y)
       for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
         double match_ncc;
-        double subimage_norm;
-        if (!is_eligible_point((int)correspondences[i].x + x,
-                               (int)correspondences[i].y + y, width, height))
+        if (!is_eligible_point(correspondences[i].x + x,
+                               correspondences[i].y + y, width, height))
           continue;
-        if (!is_eligible_distance((int)correspondences[i].x + x,
-                                  (int)correspondences[i].y + y,
-                                  (int)correspondences[i].rx,
-                                  (int)correspondences[i].ry, width, height))
+        if (!is_eligible_distance(
+                correspondences[i].x + x, correspondences[i].y + y,
+                correspondences[i].rx, correspondences[i].ry, width, height))
           continue;
-        subimage_norm =
-            compute_variance(frm, frm_stride, (int)correspondences[i].x + x,
-                             (int)correspondences[i].y + y, NULL);
-        match_ncc =
-            compute_cross_correlation(
-                frm, frm_stride, (int)correspondences[i].x + x,
-                (int)correspondences[i].y + y, ref, ref_stride,
-                (int)correspondences[i].rx, (int)correspondences[i].ry) /
-            sqrt(template_norm * subimage_norm);
+        match_ncc = compute_cross_correlation(
+            ref, ref_stride, correspondences[i].rx, correspondences[i].ry, frm,
+            frm_stride, correspondences[i].x + x, correspondences[i].y + y);
         if (match_ncc > best_match_ncc) {
           best_match_ncc = match_ncc;
           best_y = y;
@@ -159,7 +145,7 @@ int determine_correspondence(unsigned char *frm, int *frm_corners,
                              int num_frm_corners, unsigned char *ref,
                              int *ref_corners, int num_ref_corners, int width,
                              int height, int frm_stride, int ref_stride,
-                             double *correspondence_pts) {
+                             int *correspondence_pts) {
   // TODO(sarahparker) Improve this to include 2-way match
   int i, j;
   Correspondence *correspondences = (Correspondence *)correspondence_pts;
@@ -171,11 +157,8 @@ int determine_correspondence(unsigned char *frm, int *frm_corners,
     if (!is_eligible_point(frm_corners[2 * i], frm_corners[2 * i + 1], width,
                            height))
       continue;
-    template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i],
-                                     frm_corners[2 * i + 1], NULL);
     for (j = 0; j < num_ref_corners; ++j) {
       double match_ncc;
-      double subimage_norm;
       if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width,
                              height))
         continue;
@@ -183,25 +166,24 @@ int determine_correspondence(unsigned char *frm, int *frm_corners,
                                 ref_corners[2 * j], ref_corners[2 * j + 1],
                                 width, height))
         continue;
-      subimage_norm = compute_variance(ref, ref_stride, ref_corners[2 * j],
-                                       ref_corners[2 * j + 1], NULL);
-      match_ncc = compute_cross_correlation(frm, frm_stride, frm_corners[2 * i],
-                                            frm_corners[2 * i + 1], ref,
-                                            ref_stride, ref_corners[2 * j],
-                                            ref_corners[2 * j + 1]) /
-                  sqrt(template_norm * subimage_norm);
+      match_ncc = compute_cross_correlation(
+          frm, frm_stride, frm_corners[2 * i], frm_corners[2 * i + 1], ref,
+          ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]);
       if (match_ncc > best_match_ncc) {
         best_match_ncc = match_ncc;
         best_match_j = j;
       }
     }
-    if (best_match_ncc > THRESHOLD_NCC) {
-      correspondences[num_correspondences].x = (double)frm_corners[2 * i];
-      correspondences[num_correspondences].y = (double)frm_corners[2 * i + 1];
-      correspondences[num_correspondences].rx =
-          (double)ref_corners[2 * best_match_j];
+    // Note: We want to test if the best correlation is >= THRESHOLD_NCC,
+    // but need to account for the normalization in compute_cross_correlation.
+    template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i],
+                                     frm_corners[2 * i + 1]);
+    if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) {
+      correspondences[num_correspondences].x = frm_corners[2 * i];
+      correspondences[num_correspondences].y = frm_corners[2 * i + 1];
+      correspondences[num_correspondences].rx = ref_corners[2 * best_match_j];
       correspondences[num_correspondences].ry =
-          (double)ref_corners[2 * best_match_j + 1];
+          ref_corners[2 * best_match_j + 1];
       num_correspondences++;
     }
   }
diff --git a/av1/encoder/corner_match.h b/av1/encoder/corner_match.h
index 3618aa0c65..c0458642c1 100644
--- a/av1/encoder/corner_match.h
+++ b/av1/encoder/corner_match.h
@@ -16,14 +16,14 @@
 #include <memory.h>
 
 typedef struct {
-  double x, y;
-  double rx, ry;
+  int x, y;
+  int rx, ry;
 } Correspondence;
 
 int determine_correspondence(unsigned char *frm, int *frm_corners,
                              int num_frm_corners, unsigned char *ref,
                              int *ref_corners, int num_ref_corners, int width,
                              int height, int frm_stride, int ref_stride,
-                             double *correspondence_pts);
+                             int *correspondence_pts);
 
 #endif  // AV1_ENCODER_CORNER_MATCH_H_
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c
index 526e897a76..8259c45426 100644
--- a/av1/encoder/global_motion.c
+++ b/av1/encoder/global_motion.c
@@ -218,7 +218,7 @@ static INLINE RansacFunc get_ransac_type(TransformationType type) {
 
 // computes global motion parameters by fitting a model using RANSAC
 static int compute_global_motion_params(TransformationType type,
-                                        double *correspondences,
+                                        int *correspondences,
                                         int num_correspondences, double *params,
                                         int *inlier_map) {
   int result;
@@ -259,7 +259,7 @@ int compute_global_motion_feature_based(TransformationType type,
                                         double *params) {
   int num_frm_corners, num_ref_corners;
   int num_correspondences;
-  double *correspondences;
+  int *correspondences;
   int num_inliers;
   int frm_corners[2 * MAX_CORNERS], ref_corners[2 * MAX_CORNERS];
   int *inlier_map = NULL;
@@ -291,7 +291,7 @@ int compute_global_motion_feature_based(TransformationType type,
 
   // find correspondences between the two images
   correspondences =
-      (double *)malloc(num_frm_corners * 4 * sizeof(*correspondences));
+      (int *)malloc(num_frm_corners * 4 * sizeof(*correspondences));
   num_correspondences = determine_correspondence(
       frm_buffer, (int *)frm_corners, num_frm_corners, ref_buffer,
       (int *)ref_corners, num_ref_corners, frm->y_width, frm->y_height,
diff --git a/av1/encoder/ransac.c b/av1/encoder/ransac.c
index ff62105e7a..766a1cf513 100644
--- a/av1/encoder/ransac.c
+++ b/av1/encoder/ransac.c
@@ -116,7 +116,7 @@ static int get_rand_indices(int npoints, int minpts, int *indices,
   return 1;
 }
 
-static int ransac(double *matched_points, int npoints, int *number_of_inliers,
+static int ransac(int *matched_points, int npoints, int *number_of_inliers,
                   int *best_inlier_mask, double *best_params, const int minpts,
                   IsDegenerateFunc is_degenerate,
                   FindTransformationFunc find_transformation,
@@ -305,31 +305,29 @@ static int is_degenerate_homography(double *p) {
          is_collinear3(p, p + 4, p + 6) || is_collinear3(p + 2, p + 4, p + 6);
 }
 
-int ransac_translation(double *matched_points, int npoints,
-                       int *number_of_inliers, int *best_inlier_mask,
-                       double *best_params) {
+int ransac_translation(int *matched_points, int npoints, int *number_of_inliers,
+                       int *best_inlier_mask, double *best_params) {
   return ransac(matched_points, npoints, number_of_inliers, best_inlier_mask,
                 best_params, 3, is_degenerate_translation, find_translation,
                 project_points_double_translation);
 }
 
-int ransac_rotzoom(double *matched_points, int npoints, int *number_of_inliers,
+int ransac_rotzoom(int *matched_points, int npoints, int *number_of_inliers,
                    int *best_inlier_mask, double *best_params) {
   return ransac(matched_points, npoints, number_of_inliers, best_inlier_mask,
                 best_params, 3, is_degenerate_affine, find_rotzoom,
                 project_points_double_rotzoom);
 }
 
-int ransac_affine(double *matched_points, int npoints, int *number_of_inliers,
+int ransac_affine(int *matched_points, int npoints, int *number_of_inliers,
                   int *best_inlier_mask, double *best_params) {
   return ransac(matched_points, npoints, number_of_inliers, best_inlier_mask,
                 best_params, 3, is_degenerate_affine, find_affine,
                 project_points_double_affine);
 }
 
-int ransac_homography(double *matched_points, int npoints,
-                      int *number_of_inliers, int *best_inlier_mask,
-                      double *best_params) {
+int ransac_homography(int *matched_points, int npoints, int *number_of_inliers,
+                      int *best_inlier_mask, double *best_params) {
   return ransac(matched_points, npoints, number_of_inliers, best_inlier_mask,
                 best_params, 4, is_degenerate_homography, find_homography,
                 project_points_double_homography);
diff --git a/av1/encoder/ransac.h b/av1/encoder/ransac.h
index 3a749b2cc6..c45603ca75 100644
--- a/av1/encoder/ransac.h
+++ b/av1/encoder/ransac.h
@@ -19,20 +19,18 @@
 
 #include "av1/common/warped_motion.h"
 
-typedef int (*RansacFunc)(double *matched_points, int npoints,
+typedef int (*RansacFunc)(int *matched_points, int npoints,
                           int *number_of_inliers, int *best_inlier_mask,
                           double *best_params);
 
 /* Each of these functions fits a motion model from a set of
 corresponding points in 2 frames using RANSAC.*/
-int ransac_homography(double *matched_points, int npoints,
-                      int *number_of_inliers, int *best_inlier_indices,
-                      double *best_params);
-int ransac_affine(double *matched_points, int npoints, int *number_of_inliers,
+int ransac_homography(int *matched_points, int npoints, int *number_of_inliers,
+                      int *best_inlier_indices, double *best_params);
+int ransac_affine(int *matched_points, int npoints, int *number_of_inliers,
                   int *best_inlier_indices, double *best_params);
-int ransac_rotzoom(double *matched_points, int npoints, int *number_of_inliers,
+int ransac_rotzoom(int *matched_points, int npoints, int *number_of_inliers,
                    int *best_inlier_indices, double *best_params);
-int ransac_translation(double *matched_points, int npoints,
-                       int *number_of_inliers, int *best_inlier_indices,
-                       double *best_params);
+int ransac_translation(int *matched_points, int npoints, int *number_of_inliers,
+                       int *best_inlier_indices, double *best_params);
 #endif  // AV1_ENCODER_RANSAC_H_
-- 
GitLab