From 15338d5f40b3361ce82840573f65dc42ce899fba Mon Sep 17 00:00:00 2001 From: David Barker <david.barker@argondesign.com> Date: Mon, 13 Feb 2017 15:30:59 +0000 Subject: [PATCH] Speed up global motion determination When global-motion is enabled, a considerable amount of encoder time is spent in the functions in corner_match.c. This patch optimizes those functions to be 3.5-4x as fast, leading to an end-to-end encoder speed improvement (on 20 frames of tempete_cif.y4m) of: 200kbps: ~26% faster 800kbps: ~19% faster 2800kbps: ~12% faster Change-Id: I04d3f87484c36c41eb5a1e86e814f2accbe86297 --- av1/encoder/corner_match.c | 140 ++++++++++++++++-------------------- av1/encoder/corner_match.h | 6 +- av1/encoder/global_motion.c | 6 +- av1/encoder/ransac.c | 16 ++--- av1/encoder/ransac.h | 16 ++--- 5 files changed, 81 insertions(+), 103 deletions(-) diff --git a/av1/encoder/corner_match.c b/av1/encoder/corner_match.c index 63225a8b32..64ee0c5ae1 100644 --- a/av1/encoder/corner_match.c +++ b/av1/encoder/corner_match.c @@ -24,11 +24,13 @@ #define THRESHOLD_NCC 0.75 -static double compute_variance(unsigned char *im, int stride, int x, int y, - double *mean) { - double sum = 0.0; - double sumsq = 0.0; - double var; +/* Compute var(im) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of im, + centered at (x, y). +*/ +static double compute_variance(unsigned char *im, int stride, int x, int y) { + int sum = 0.0; + int sumsq = 0.0; + int var; int i, j; for (i = 0; i < MATCH_SZ; ++i) for (j = 0; j < MATCH_SZ; ++j) { @@ -36,39 +38,45 @@ static double compute_variance(unsigned char *im, int stride, int x, int y, sumsq += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] * im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; } - var = (sumsq * MATCH_SZ_SQ - sum * sum) / (MATCH_SZ_SQ * MATCH_SZ_SQ); - if (mean) *mean = sum / MATCH_SZ_SQ; - return var; + var = sumsq * MATCH_SZ_SQ - sum * sum; + return (double)var; } +/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the + correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows + of each image, centered at (x1, y1) and (x2, y2) respectively. +*/ static double compute_cross_correlation(unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2) { - double sum1 = 0; - double sum2 = 0; - double cross = 0; - double corr; + int v1, v2; + int sum1 = 0; + int sum2 = 0; + int sumsq2 = 0; + int cross = 0; + int var2, cov; int i, j; for (i = 0; i < MATCH_SZ; ++i) for (j = 0; j < MATCH_SZ; ++j) { - sum1 += im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)]; - sum2 += im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; - cross += - im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)] * - im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; + v1 = im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)]; + v2 = im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; + sum1 += v1; + sum2 += v2; + sumsq2 += v2 * v2; + cross += v1 * v2; } - corr = (cross * MATCH_SZ_SQ - sum1 * sum2) / (MATCH_SZ_SQ * MATCH_SZ_SQ); - return corr; + var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; + cov = cross * MATCH_SZ_SQ - sum1 * sum2; + return cov / sqrt((double)var2); } -static int is_eligible_point(double pointx, double pointy, int width, - int height) { +static int is_eligible_point(int pointx, int pointy, int width, int height) { return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 && pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height); } -static int is_eligible_distance(double point1x, double point1y, double point2x, - double point2y, int width, int height) { +static int is_eligible_distance(int point1x, int point1y, int point2x, + int point2y, int width, int height) { const int thresh = (width < height ? height : width) >> 4; return ((point1x - point2x) * (point1x - point2x) + (point1y - point2y) * (point1y - point2y)) <= thresh * thresh; @@ -81,32 +89,21 @@ static void improve_correspondence(unsigned char *frm, unsigned char *ref, int num_correspondences) { int i; for (i = 0; i < num_correspondences; ++i) { - double template_norm = - compute_variance(frm, frm_stride, (int)correspondences[i].x, - (int)correspondences[i].y, NULL); int x, y, best_x = 0, best_y = 0; double best_match_ncc = 0.0; for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) { for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { double match_ncc; - double subimage_norm; - if (!is_eligible_point((int)correspondences[i].rx + x, - (int)correspondences[i].ry + y, width, height)) + if (!is_eligible_point(correspondences[i].rx + x, + correspondences[i].ry + y, width, height)) continue; - if (!is_eligible_distance( - (int)correspondences[i].x, (int)correspondences[i].y, - (int)correspondences[i].rx + x, (int)correspondences[i].ry + y, - width, height)) + if (!is_eligible_distance(correspondences[i].x, correspondences[i].y, + correspondences[i].rx + x, + correspondences[i].ry + y, width, height)) continue; - subimage_norm = - compute_variance(ref, ref_stride, (int)correspondences[i].rx + x, - (int)correspondences[i].ry + y, NULL); match_ncc = compute_cross_correlation( - frm, frm_stride, (int)correspondences[i].x, - (int)correspondences[i].y, ref, ref_stride, - (int)correspondences[i].rx + x, - (int)correspondences[i].ry + y) / - sqrt(template_norm * subimage_norm); + frm, frm_stride, correspondences[i].x, correspondences[i].y, ref, + ref_stride, correspondences[i].rx + x, correspondences[i].ry + y); if (match_ncc > best_match_ncc) { best_match_ncc = match_ncc; best_y = y; @@ -114,36 +111,25 @@ static void improve_correspondence(unsigned char *frm, unsigned char *ref, } } } - correspondences[i].rx += (double)best_x; - correspondences[i].ry += (double)best_y; + correspondences[i].rx += best_x; + correspondences[i].ry += best_y; } for (i = 0; i < num_correspondences; ++i) { - double template_norm = - compute_variance(ref, ref_stride, (int)correspondences[i].rx, - (int)correspondences[i].ry, NULL); int x, y, best_x = 0, best_y = 0; double best_match_ncc = 0.0; for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { double match_ncc; - double subimage_norm; - if (!is_eligible_point((int)correspondences[i].x + x, - (int)correspondences[i].y + y, width, height)) + if (!is_eligible_point(correspondences[i].x + x, + correspondences[i].y + y, width, height)) continue; - if (!is_eligible_distance((int)correspondences[i].x + x, - (int)correspondences[i].y + y, - (int)correspondences[i].rx, - (int)correspondences[i].ry, width, height)) + if (!is_eligible_distance( + correspondences[i].x + x, correspondences[i].y + y, + correspondences[i].rx, correspondences[i].ry, width, height)) continue; - subimage_norm = - compute_variance(frm, frm_stride, (int)correspondences[i].x + x, - (int)correspondences[i].y + y, NULL); - match_ncc = - compute_cross_correlation( - frm, frm_stride, (int)correspondences[i].x + x, - (int)correspondences[i].y + y, ref, ref_stride, - (int)correspondences[i].rx, (int)correspondences[i].ry) / - sqrt(template_norm * subimage_norm); + match_ncc = compute_cross_correlation( + ref, ref_stride, correspondences[i].rx, correspondences[i].ry, frm, + frm_stride, correspondences[i].x + x, correspondences[i].y + y); if (match_ncc > best_match_ncc) { best_match_ncc = match_ncc; best_y = y; @@ -159,7 +145,7 @@ int determine_correspondence(unsigned char *frm, int *frm_corners, int num_frm_corners, unsigned char *ref, int *ref_corners, int num_ref_corners, int width, int height, int frm_stride, int ref_stride, - double *correspondence_pts) { + int *correspondence_pts) { // TODO(sarahparker) Improve this to include 2-way match int i, j; Correspondence *correspondences = (Correspondence *)correspondence_pts; @@ -171,11 +157,8 @@ int determine_correspondence(unsigned char *frm, int *frm_corners, if (!is_eligible_point(frm_corners[2 * i], frm_corners[2 * i + 1], width, height)) continue; - template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i], - frm_corners[2 * i + 1], NULL); for (j = 0; j < num_ref_corners; ++j) { double match_ncc; - double subimage_norm; if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width, height)) continue; @@ -183,25 +166,24 @@ int determine_correspondence(unsigned char *frm, int *frm_corners, ref_corners[2 * j], ref_corners[2 * j + 1], width, height)) continue; - subimage_norm = compute_variance(ref, ref_stride, ref_corners[2 * j], - ref_corners[2 * j + 1], NULL); - match_ncc = compute_cross_correlation(frm, frm_stride, frm_corners[2 * i], - frm_corners[2 * i + 1], ref, - ref_stride, ref_corners[2 * j], - ref_corners[2 * j + 1]) / - sqrt(template_norm * subimage_norm); + match_ncc = compute_cross_correlation( + frm, frm_stride, frm_corners[2 * i], frm_corners[2 * i + 1], ref, + ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]); if (match_ncc > best_match_ncc) { best_match_ncc = match_ncc; best_match_j = j; } } - if (best_match_ncc > THRESHOLD_NCC) { - correspondences[num_correspondences].x = (double)frm_corners[2 * i]; - correspondences[num_correspondences].y = (double)frm_corners[2 * i + 1]; - correspondences[num_correspondences].rx = - (double)ref_corners[2 * best_match_j]; + // Note: We want to test if the best correlation is >= THRESHOLD_NCC, + // but need to account for the normalization in compute_cross_correlation. + template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i], + frm_corners[2 * i + 1]); + if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) { + correspondences[num_correspondences].x = frm_corners[2 * i]; + correspondences[num_correspondences].y = frm_corners[2 * i + 1]; + correspondences[num_correspondences].rx = ref_corners[2 * best_match_j]; correspondences[num_correspondences].ry = - (double)ref_corners[2 * best_match_j + 1]; + ref_corners[2 * best_match_j + 1]; num_correspondences++; } } diff --git a/av1/encoder/corner_match.h b/av1/encoder/corner_match.h index 3618aa0c65..c0458642c1 100644 --- a/av1/encoder/corner_match.h +++ b/av1/encoder/corner_match.h @@ -16,14 +16,14 @@ #include <memory.h> typedef struct { - double x, y; - double rx, ry; + int x, y; + int rx, ry; } Correspondence; int determine_correspondence(unsigned char *frm, int *frm_corners, int num_frm_corners, unsigned char *ref, int *ref_corners, int num_ref_corners, int width, int height, int frm_stride, int ref_stride, - double *correspondence_pts); + int *correspondence_pts); #endif // AV1_ENCODER_CORNER_MATCH_H_ diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c index 526e897a76..8259c45426 100644 --- a/av1/encoder/global_motion.c +++ b/av1/encoder/global_motion.c @@ -218,7 +218,7 @@ static INLINE RansacFunc get_ransac_type(TransformationType type) { // computes global motion parameters by fitting a model using RANSAC static int compute_global_motion_params(TransformationType type, - double *correspondences, + int *correspondences, int num_correspondences, double *params, int *inlier_map) { int result; @@ -259,7 +259,7 @@ int compute_global_motion_feature_based(TransformationType type, double *params) { int num_frm_corners, num_ref_corners; int num_correspondences; - double *correspondences; + int *correspondences; int num_inliers; int frm_corners[2 * MAX_CORNERS], ref_corners[2 * MAX_CORNERS]; int *inlier_map = NULL; @@ -291,7 +291,7 @@ int compute_global_motion_feature_based(TransformationType type, // find correspondences between the two images correspondences = - (double *)malloc(num_frm_corners * 4 * sizeof(*correspondences)); + (int *)malloc(num_frm_corners * 4 * sizeof(*correspondences)); num_correspondences = determine_correspondence( frm_buffer, (int *)frm_corners, num_frm_corners, ref_buffer, (int *)ref_corners, num_ref_corners, frm->y_width, frm->y_height, diff --git a/av1/encoder/ransac.c b/av1/encoder/ransac.c index ff62105e7a..766a1cf513 100644 --- a/av1/encoder/ransac.c +++ b/av1/encoder/ransac.c @@ -116,7 +116,7 @@ static int get_rand_indices(int npoints, int minpts, int *indices, return 1; } -static int ransac(double *matched_points, int npoints, int *number_of_inliers, +static int ransac(int *matched_points, int npoints, int *number_of_inliers, int *best_inlier_mask, double *best_params, const int minpts, IsDegenerateFunc is_degenerate, FindTransformationFunc find_transformation, @@ -305,31 +305,29 @@ static int is_degenerate_homography(double *p) { is_collinear3(p, p + 4, p + 6) || is_collinear3(p + 2, p + 4, p + 6); } -int ransac_translation(double *matched_points, int npoints, - int *number_of_inliers, int *best_inlier_mask, - double *best_params) { +int ransac_translation(int *matched_points, int npoints, int *number_of_inliers, + int *best_inlier_mask, double *best_params) { return ransac(matched_points, npoints, number_of_inliers, best_inlier_mask, best_params, 3, is_degenerate_translation, find_translation, project_points_double_translation); } -int ransac_rotzoom(double *matched_points, int npoints, int *number_of_inliers, +int ransac_rotzoom(int *matched_points, int npoints, int *number_of_inliers, int *best_inlier_mask, double *best_params) { return ransac(matched_points, npoints, number_of_inliers, best_inlier_mask, best_params, 3, is_degenerate_affine, find_rotzoom, project_points_double_rotzoom); } -int ransac_affine(double *matched_points, int npoints, int *number_of_inliers, +int ransac_affine(int *matched_points, int npoints, int *number_of_inliers, int *best_inlier_mask, double *best_params) { return ransac(matched_points, npoints, number_of_inliers, best_inlier_mask, best_params, 3, is_degenerate_affine, find_affine, project_points_double_affine); } -int ransac_homography(double *matched_points, int npoints, - int *number_of_inliers, int *best_inlier_mask, - double *best_params) { +int ransac_homography(int *matched_points, int npoints, int *number_of_inliers, + int *best_inlier_mask, double *best_params) { return ransac(matched_points, npoints, number_of_inliers, best_inlier_mask, best_params, 4, is_degenerate_homography, find_homography, project_points_double_homography); diff --git a/av1/encoder/ransac.h b/av1/encoder/ransac.h index 3a749b2cc6..c45603ca75 100644 --- a/av1/encoder/ransac.h +++ b/av1/encoder/ransac.h @@ -19,20 +19,18 @@ #include "av1/common/warped_motion.h" -typedef int (*RansacFunc)(double *matched_points, int npoints, +typedef int (*RansacFunc)(int *matched_points, int npoints, int *number_of_inliers, int *best_inlier_mask, double *best_params); /* Each of these functions fits a motion model from a set of corresponding points in 2 frames using RANSAC.*/ -int ransac_homography(double *matched_points, int npoints, - int *number_of_inliers, int *best_inlier_indices, - double *best_params); -int ransac_affine(double *matched_points, int npoints, int *number_of_inliers, +int ransac_homography(int *matched_points, int npoints, int *number_of_inliers, + int *best_inlier_indices, double *best_params); +int ransac_affine(int *matched_points, int npoints, int *number_of_inliers, int *best_inlier_indices, double *best_params); -int ransac_rotzoom(double *matched_points, int npoints, int *number_of_inliers, +int ransac_rotzoom(int *matched_points, int npoints, int *number_of_inliers, int *best_inlier_indices, double *best_params); -int ransac_translation(double *matched_points, int npoints, - int *number_of_inliers, int *best_inlier_indices, - double *best_params); +int ransac_translation(int *matched_points, int npoints, int *number_of_inliers, + int *best_inlier_indices, double *best_params); #endif // AV1_ENCODER_RANSAC_H_ -- GitLab