From 118bf67cb61f492023da085eb7446f6d9cbd598c Mon Sep 17 00:00:00 2001
From: Angie Chiang <angiebird@google.com>
Date: Fri, 3 Feb 2017 17:12:44 -0800
Subject: [PATCH] Implement shorter-tap first in convolve_round

The performance change is 0.004% on lowres

Change-Id: If3702ba6377ac42997e7d49b8959ff16fb182daa
---
 av1/common/convolve.c | 48 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 6cd24e3d0e..1bd1ab4b94 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -252,6 +252,24 @@ void av1_convolve_2d(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
   }
 }
 
+static INLINE void transpose_uint8(uint8_t *dst, int dst_stride,
+                                   const uint8_t *src, int src_stride, int w,
+                                   int h) {
+  int r, c;
+  for (r = 0; r < h; ++r)
+    for (c = 0; c < w; ++c)
+      dst[c * (dst_stride) + r] = src[r * (src_stride) + c];
+}
+
+static INLINE void transpose_int32(int32_t *dst, int dst_stride,
+                                   const int32_t *src, int src_stride, int w,
+                                   int h) {
+  int r, c;
+  for (r = 0; r < h; ++r)
+    for (c = 0; c < w; ++c)
+      dst[c * (dst_stride) + r] = src[r * (src_stride) + c];
+}
+
 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilter *interp_filter,
@@ -272,9 +290,33 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
     // This will reduce hardware implementation cost.
     filter_params_y = av1_get_interp_filter_params(EIGHTTAP_SHARP);
   }
-  av1_convolve_2d(src, src_stride, conv_params->dst, conv_params->dst_stride, w,
-                  h, &filter_params_x, &filter_params_y, subpel_x_q4,
-                  subpel_y_q4, conv_params);
+
+  if (filter_params_y.taps < filter_params_x.taps) {
+    uint8_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
+                   (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
+    int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
+    CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
+    int tr_dst_stride = MAX_SB_SIZE;
+    int fo_vert = filter_params_y.taps / 2 - 1;
+    int fo_horiz = filter_params_x.taps / 2 - 1;
+
+    transpose_uint8(tr_src, tr_src_stride,
+                    src - fo_vert * src_stride - fo_horiz, src_stride,
+                    w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
+    transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
+                    conv_params->dst_stride, w, h);
+
+    // horizontal and vertical parameters are swapped because of the transpose
+    av1_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride,
+                    tr_dst, tr_dst_stride, h, w, &filter_params_y,
+                    &filter_params_x, subpel_y_q4, subpel_x_q4, conv_params);
+    transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
+                    tr_dst_stride, h, w);
+  } else {
+    av1_convolve_2d(src, src_stride, conv_params->dst, conv_params->dst_stride,
+                    w, h, &filter_params_x, &filter_params_y, subpel_x_q4,
+                    subpel_y_q4, conv_params);
+  }
 }
 
 #endif  // CONFIG_CONVOLVE_ROUND
-- 
GitLab