aom_convolve.c 52.7 KB
Newer Older
John Koleszar's avatar
John Koleszar committed
1
/*
Yaowu Xu's avatar
Yaowu Xu committed
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
John Koleszar's avatar
John Koleszar committed
3
 *
Yaowu Xu's avatar
Yaowu Xu committed
4 5 6 7 8 9
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
John Koleszar's avatar
John Koleszar committed
10
 */
Christian Duvivier's avatar
Christian Duvivier committed
11

John Koleszar's avatar
John Koleszar committed
12
#include <assert.h>
Zoe Liu's avatar
Zoe Liu committed
13
#include <string.h>
John Koleszar's avatar
John Koleszar committed
14

Yaowu Xu's avatar
Yaowu Xu committed
15 16 17 18 19 20
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
21
#include "aom_ports/mem.h"
John Koleszar's avatar
John Koleszar committed
22

23 24 25 26 27 28 29 30 31 32 33 34 35
// Returns the dot product of SUBPEL_TAPS consecutive pixels starting at `a`
// with the SUBPEL_TAPS filter coefficients in `b`. The sum is returned
// unrounded and unclipped.
static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
  int acc = 0;
  int tap = 0;
  while (tap < SUBPEL_TAPS) {
    acc += a[tap] * b[tap];
    ++tap;
  }
  return acc;
}

// Returns the dot product of SUBPEL_TAPS pixels taken from a single column
// (starting at `a`, `a_stride` elements apart) with the filter coefficients
// in `b`. The sum is returned unrounded and unclipped.
static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
                                      const int16_t *b) {
  int acc = 0;
  int tap = 0;
  while (tap < SUBPEL_TAPS) {
    acc += a[tap * a_stride] * b[tap];
    ++tap;
  }
  return acc;
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
36 37
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
38 39
                           const InterpKernel *x_filters, int x0_q4,
                           int x_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
40
  src -= SUBPEL_TAPS / 2 - 1;
41
  for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
42
    int x_q4 = x0_q4;
43
    for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
44 45
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
46
      const int sum = horz_scalar_product(src_x, x_filter);
47
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
John Koleszar's avatar
John Koleszar committed
48 49 50 51 52 53 54
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
55 56 57 58 59
static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_qn,
                                   int x_step_qn, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
60
  for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
61
    int x_qn = x0_qn;
62
    for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
63 64 65 66
      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];  // q8
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter = x_filters[x_filter_idx];
67
      const int sum = horz_scalar_product(src_x, x_filter);
Fergus Simpson's avatar
Fergus Simpson committed
68 69 70 71 72 73 74 75
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_qn += x_step_qn;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
76 77
static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
78 79
                               const InterpKernel *x_filters, int x0_q4,
                               int x_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
80
  src -= SUBPEL_TAPS / 2 - 1;
81
  for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
82
    int x_q4 = x0_q4;
83
    for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
84 85
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
86
      const int sum = horz_scalar_product(src_x, x_filter);
clang-format's avatar
clang-format committed
87 88
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
John Koleszar's avatar
John Koleszar committed
89 90 91 92 93 94 95
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
96 97 98 99 100
static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_qn,
                                       int x_step_qn, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
101
  for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
102
    int x_qn = x0_qn;
103
    for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
104 105 106 107
      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter = x_filters[x_filter_idx];
108
      const int sum = horz_scalar_product(src_x, x_filter);
Fergus Simpson's avatar
Fergus Simpson committed
109 110 111 112 113 114 115 116 117
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
      x_qn += x_step_qn;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
118 119
static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
120 121
                          const InterpKernel *y_filters, int y0_q4,
                          int y_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
122
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
John Koleszar's avatar
John Koleszar committed
123

124
  for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
125
    int y_q4 = y0_q4;
126
    for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
127 128
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
129
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
130
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
John Koleszar's avatar
John Koleszar committed
131 132 133 134 135 136 137
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
138 139 140 141 142 143
static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_qn,
                                  int y_step_qn, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

144
  for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
145
    int y_qn = y0_qn;
146
    for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
147 148 149 150
      const unsigned char *src_y =
          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter =
          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
151
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
Fergus Simpson's avatar
Fergus Simpson committed
152 153 154 155 156 157 158 159
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      y_qn += y_step_qn;
    }
    ++src;
    ++dst;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
160 161
static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
162 163
                              const InterpKernel *y_filters, int y0_q4,
                              int y_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
164
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
John Koleszar's avatar
John Koleszar committed
165

166
  for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
167
    int y_q4 = y0_q4;
168
    for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
169 170
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
171
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
clang-format's avatar
clang-format committed
172 173 174 175
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
John Koleszar's avatar
John Koleszar committed
176 177 178 179 180 181 182
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
183 184 185 186 187 188
static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_qn,
                                      int y_step_qn, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

189
  for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
190
    int y_qn = y0_qn;
191
    for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
192 193 194 195
      const unsigned char *src_y =
          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter =
          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
196
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
Fergus Simpson's avatar
Fergus Simpson committed
197 198 199 200 201 202 203 204 205 206 207
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
      y_qn += y_step_qn;
    }
    ++src;
    ++dst;
  }
}

clang-format's avatar
clang-format committed
208 209
static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,
Dmitry Kovalev's avatar
Dmitry Kovalev committed
210
                     int x0_q4, int x_step_q4,
clang-format's avatar
clang-format committed
211 212
                     const InterpKernel *const y_filters, int y0_q4,
                     int y_step_q4, int w, int h) {
213 214 215 216 217 218 219 220 221 222 223 224
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
225
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
226
  const int intermediate_height =
clang-format's avatar
clang-format committed
227
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
228

229 230
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
231

232 233
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);
234

clang-format's avatar
clang-format committed
235 236 237 238 239
  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
                 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                 intermediate_height);
  convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
                dst_stride, y_filters, y0_q4, y_step_q4, w, h);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
240 241
}

Fergus Simpson's avatar
Fergus Simpson committed
242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
// Two-pass 2D sub-pixel interpolation of a w x h block, with positions
// (x0_qn, y0_qn) and steps (x_step_qn, y_step_qn) carrying SCALE_SUBPEL_BITS
// fractional bits, i.e. one full pixel is 1 << SCALE_SUBPEL_BITS.
static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters, int x0_qn,
                             int x_step_qn, const InterpKernel *const y_filters,
                             int y0_qn, int y_step_qn, int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the number of rows needed in the temp buffer:
  // --Smallest scaling factor is x1/2 ==> a step of two full pixels, i.e.
  //   y_step_qn = 2 << SCALE_SUBPEL_BITS.
  // --h output rows span a distance of (h - 1) * y_step_qn in the original
  //   frame (in qn units); y0_qn accounts for the sub-pixel start position.
  // --Require an additional SUBPEL_TAPS rows for the filter tails.
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  const int intermediate_height =
      (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  // Steps are in qn units, so the x1/2 limit is two full pixels
  // (2 << SCALE_SUBPEL_BITS), not twice the number of fractional bits.
  assert(y_step_qn <= (2 << SCALE_SUBPEL_BITS));
  assert(x_step_qn <= (2 << SCALE_SUBPEL_BITS));

  // The horizontal pass writes intermediate_height rows of MAX_SB_SIZE bytes
  // into temp, so the row count must fit in the MAX_EXT_SIZE rows allocated.
  assert(intermediate_height <= MAX_EXT_SIZE);

  // Start SUBPEL_TAPS / 2 - 1 rows above the block so the vertical filter has
  // its leading rows available in temp.
  convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
                         temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
                         intermediate_height);
  // Skip those leading rows again; the vertical filter backs up by the same
  // amount.
  convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
                        dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
}

277
// Returns the start of the kernel table that `filter` points into, obtained
// by clearing the low 8 bits of the pointer value.
static const InterpKernel *get_filter_base(const int16_t *filter) {
  // NOTE: This assumes that the filter table is 256-byte aligned.
  // TODO(agrange) Modify to make independent of table alignment.
  const intptr_t addr = (intptr_t)filter;
  const intptr_t low_bits = (intptr_t)0xFF;
  return (const InterpKernel *)(addr & ~low_bits);
}

283
// Returns the index of kernel `f` within the table starting at `base`,
// counted in whole InterpKernel entries.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  // Round-trip through intptr_t to reinterpret the tap pointer as a kernel
  // pointer.
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}

Yaowu Xu's avatar
Yaowu Xu committed
287
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
288
                           uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
289
                           const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
290 291
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
292
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
293 294
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

295 296 297
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
298 299
  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
                 w, h);
John Koleszar's avatar
John Koleszar committed
300 301
}

Fergus Simpson's avatar
Fergus Simpson committed
302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
// Horizontal-only convolution entry point for the scaled (qn) path.
// The kernel table is recovered from filter_x and subpel_x is used as the
// starting position; subpel_y, filter_y and y_step_qn are ignored.
void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int subpel_x,
                                 int x_step_qn, const int16_t *filter_y,
                                 int subpel_y, int y_step_qn, int w, int h) {
  (void)filter_y;
  (void)subpel_y;
  (void)y_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_x);

  convolve_horiz_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                         subpel_x, x_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
317
void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
318
                               uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
319
                               const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
320 321
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h) {
322
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
323 324
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

325 326 327
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
328 329
  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                     x_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
330 331
}

Fergus Simpson's avatar
Fergus Simpson committed
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347
// Horizontal-only averaging convolution entry point for the scaled (qn) path.
// The kernel table is recovered from filter_x and subpel_x is used as the
// starting position; subpel_y, filter_y and y_step_qn are ignored.
void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int subpel_x,
                                     int x_step_qn, const int16_t *filter_y,
                                     int subpel_y, int y_step_qn, int w,
                                     int h) {
  (void)filter_y;
  (void)subpel_y;
  (void)y_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_x);

  convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                             subpel_x, x_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
348
void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
349
                          uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
350
                          const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
351 352
                          const int16_t *filter_y, int y_step_q4, int w,
                          int h) {
353
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
354
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
355 356 357 358

  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
359 360
  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
                w, h);
John Koleszar's avatar
John Koleszar committed
361 362
}

Fergus Simpson's avatar
Fergus Simpson committed
363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
// Vertical-only convolution entry point for the scaled (qn) path.
// The kernel table is recovered from filter_y and subpel_y is used as the
// starting position; subpel_x, filter_x and x_step_qn are ignored.
void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int subpel_x,
                                int x_step_qn, const int16_t *filter_y,
                                int subpel_y, int y_step_qn, int w, int h) {
  (void)filter_x;
  (void)subpel_x;
  (void)x_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_y);

  convolve_vert_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                        subpel_y, y_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
378
void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
379
                              uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
380
                              const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
381 382
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
383
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
384
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
385 386 387 388

  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
389 390
  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                    y_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
391 392
}

Fergus Simpson's avatar
Fergus Simpson committed
393 394 395 396 397 398 399 400 401 402 403 404 405 406 407
// Vertical-only averaging convolution entry point for the scaled (qn) path.
// The kernel table is recovered from filter_y and subpel_y is used as the
// starting position; subpel_x, filter_x and x_step_qn are ignored.
void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int subpel_x,
                                    int x_step_qn, const int16_t *filter_y,
                                    int subpel_y, int y_step_qn, int w, int h) {
  (void)filter_x;
  (void)subpel_x;
  (void)x_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_y);

  convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                            subpel_y, y_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
408
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
clang-format's avatar
clang-format committed
409 410
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
John Koleszar's avatar
John Koleszar committed
411
                     int w, int h) {
412
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
413 414
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

415
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
416 417
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

clang-format's avatar
clang-format committed
418
  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
Dmitry Kovalev's avatar
Dmitry Kovalev committed
419
           filters_y, y0_q4, y_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
420 421
}

Fergus Simpson's avatar
Fergus Simpson committed
422 423 424 425 426 427 428 429 430 431 432 433 434
// 2D convolution entry point for the scaled (qn) path.
// Kernel tables are recovered from the filter pointers; subpel_x / subpel_y
// are passed through as the starting positions.
void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int subpel_x, int x_step_qn,
                           const int16_t *filter_y, int subpel_y, int y_step_qn,
                           int w, int h) {
  const InterpKernel *const x_table = get_filter_base(filter_x);
  const InterpKernel *const y_table = get_filter_base(filter_y);

  convolve_scale_c(src, src_stride, dst, dst_stride, x_table, subpel_x,
                   x_step_qn, y_table, subpel_y, y_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
435
void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
clang-format's avatar
clang-format committed
436 437
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
John Koleszar's avatar
John Koleszar committed
438
                         int w, int h) {
Christian Duvivier's avatar
Christian Duvivier committed
439
  /* Fixed size intermediate buffer places limits on parameters. */
440 441 442
  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
Christian Duvivier's avatar
Christian Duvivier committed
443

Yaowu Xu's avatar
Yaowu Xu committed
444
  aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
clang-format's avatar
clang-format committed
445
                  filter_y, y_step_q4, w, h);
Yaowu Xu's avatar
Yaowu Xu committed
446
  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
clang-format's avatar
clang-format committed
447
                     h);
John Koleszar's avatar
John Koleszar committed
448
}
449

Fergus Simpson's avatar
Fergus Simpson committed
450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465
// 2D averaging convolution entry point for the scaled (qn) path.
// The block is first filtered into a local MAX_SB_SIZE x MAX_SB_SIZE buffer
// and then blended into dst with aom_convolve_avg_c().
void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int subpel_x,
                               int x_step_qn, const int16_t *filter_y,
                               int subpel_y, int y_step_qn, int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  /* Pass 1: filter into the scratch buffer. */
  aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
                        x_step_qn, filter_y, subpel_y, y_step_qn, w, h);

  /* Pass 2: average the scratch buffer into the destination. */
  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
                     h);
}

Yaowu Xu's avatar
Yaowu Xu committed
466
// Copies a w x h block of bytes from src to dst, one row at a time.
// The filter arguments exist only to match the common convolve signature and
// are ignored. Nothing is copied when w or h is not positive.
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  // A negative width would be converted to a huge size_t by memcpy().
  if (w <= 0) return;

  for (int r = h; r > 0; --r) {
    memcpy(dst, src, (size_t)w);
    src += src_stride;
    dst += dst_stride;
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
484
// Blends a w x h block of src into dst: every destination pixel becomes the
// rounded average of its previous value and the co-located source pixel.
// The filter arguments exist only to match the common convolve signature and
// are ignored.
void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  const uint8_t *src_row = src;
  uint8_t *dst_row = dst;
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      dst_row[col] = ROUND_POWER_OF_TWO(dst_row[col] + src_row[col], 1);
    }
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
500

Yaowu Xu's avatar
Yaowu Xu committed
501
// Scaled horizontal prediction: forwards every argument unchanged to
// aom_convolve8_horiz_c().
void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4,
                        filter_y, y_step_q4,
                        w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
509
// Scaled vertical prediction: forwards every argument unchanged to
// aom_convolve8_vert_c().
void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  aom_convolve8_vert_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
517
// Scaled 2D prediction: forwards every argument unchanged to
// aom_convolve8_c().
void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  aom_convolve8_c(src, src_stride, dst, dst_stride,
                  filter_x, x_step_q4,
                  filter_y, y_step_q4,
                  w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
525
// Scaled horizontal averaging prediction: forwards every argument unchanged
// to aom_convolve8_avg_horiz_c().
void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
534
// Scaled vertical averaging prediction: forwards every argument unchanged to
// aom_convolve8_avg_vert_c().
void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
543
// Scaled 2D averaging prediction: forwards every argument unchanged to
// aom_convolve8_avg_c().
void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  aom_convolve8_avg_c(src, src_stride, dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
}

551 552 553 554 555 556 557 558
// 16-bit-sample counterpart of vert_scalar_product(): dot product of
// SUBPEL_TAPS samples taken from one column (starting at `a`, `a_stride`
// elements apart) with the filter coefficients in `b`. The sum is returned
// unrounded and unclipped.
static INLINE int highbd_vert_scalar_product(const uint16_t *a,
                                             ptrdiff_t a_stride,
                                             const int16_t *b) {
  int acc = 0;
  int tap = 0;
  while (tap < SUBPEL_TAPS) {
    acc += a[tap * a_stride] * b[tap];
    ++tap;
  }
  return acc;
}

Fergus Simpson's avatar
Fergus Simpson committed
559
// TODO(afergs): Make sure this works too
560 561 562 563 564 565
#if CONFIG_LOOP_RESTORATION
static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_q4,
                                   int x_step_q4, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
566
  for (int y = 0; y < h; ++y) {
567
    int x_q4 = x0_q4;
568
    for (int x = 0; x < w; ++x) {
569 570
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
571 572

      const int sum = horz_scalar_product(src_x, x_filter);
573 574 575 576 577 578 579 580 581 582 583 584 585 586 587
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                          src_x[SUBPEL_TAPS / 2 - 1]);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_q4,
                                  int y_step_q4, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

588
  for (int x = 0; x < w; ++x) {
589
    int y_q4 = y0_q4;
590
    for (int y = 0; y < h; ++y) {
591 592
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
593
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609
      dst[y * dst_stride] =
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                     src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters, int x0_q4,
                             int x_step_q4, const InterpKernel *const y_filters,
                             int y0_q4, int y_step_q4, int w, int h) {
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
610
  const int intermediate_height =
611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
                         temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                         intermediate_height);
  convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
                        dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
}

// Public horizontal "add_src" convolution entry point (C version).
// Resolves filter_x into a kernel table plus a starting position through
// get_filter_base()/get_filter_offset() and runs convolve_add_src_horiz().
// filter_y and y_step_q4 are accepted but not used.
void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  // Vertical arguments are unused in the horizontal-only path.
  (void)filter_y;
  (void)y_step_q4;

  const InterpKernel *const kernels = get_filter_base(filter_x);

  convolve_add_src_horiz(src, src_stride, dst, dst_stride, kernels,
                         get_filter_offset(filter_x, kernels), x_step_q4, w,
                         h);
}

// Public vertical "add_src" convolution entry point (C version).
// Resolves filter_y into a kernel table plus a starting position through
// get_filter_base()/get_filter_offset() and runs convolve_add_src_vert().
// filter_x and x_step_q4 are accepted but not used.
void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  // Horizontal arguments are unused in the vertical-only path.
  (void)filter_x;
  (void)x_step_q4;

  const InterpKernel *const kernels = get_filter_base(filter_y);

  convolve_add_src_vert(src, src_stride, dst, dst_stride, kernels,
                        get_filter_offset(filter_y, kernels), y_step_q4, w, h);
}

// Public 2-D "add_src" convolution entry point (C version).
// Resolves filter_x and filter_y into kernel tables plus starting positions
// through get_filter_base()/get_filter_offset() and runs convolve_add_src().
void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  // Horizontal kernel table and starting position.
  const InterpKernel *const horiz_kernels = get_filter_base(filter_x);
  const int horiz_start_q4 = get_filter_offset(filter_x, horiz_kernels);
  // Vertical kernel table and starting position.
  const InterpKernel *const vert_kernels = get_filter_base(filter_y);
  const int vert_start_q4 = get_filter_offset(filter_y, vert_kernels);

  convolve_add_src(src, src_stride, dst, dst_stride, horiz_kernels,
                   horiz_start_q4, x_step_q4, vert_kernels, vert_start_q4,
                   y_step_q4, w, h);
}
670 671 672 673 674

static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_q4,
                                       int x_step_q4, int w, int h) {
675
  const int bd = 8;
676
  src -= SUBPEL_TAPS / 2 - 1;
677
  for (int y = 0; y < h; ++y) {
678
    int x_q4 = x0_q4;
679
    for (int x = 0; x < w; ++x) {
680 681
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
682 683 684
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
                           (1 << (bd + FILTER_BITS - 1));
      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
685 686
      dst[x] =
          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
687
                          0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
688 689 690 691 692 693 694 695 696 697 698
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_q4,
                                      int y_step_q4, int w, int h) {
699
  const int bd = 8;
700 701
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

702
  for (int x = 0; x < w; ++x) {
703
    int y_q4 = y0_q4;
704
    for (int y = 0; y < h; ++y) {
705 706
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
707
      const int rounding =
708 709
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
710 711
      const int sum =
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727
      dst[y * dst_stride] =
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *const x_filters, int x0_q4,
                                 int x_step_q4,
                                 const InterpKernel *const y_filters, int y0_q4,
                                 int y_step_q4, int w, int h) {
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
728
  const int intermediate_height =
729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                             src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
                             x_step_q4, w, intermediate_height);
  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
                            y_step_q4, w, h);
}

// Public horizontal "add_src" entry point with 16-bit output (C version).
// Resolves filter_x into a kernel table plus a starting position through
// get_filter_base()/get_filter_offset() and runs
// convolve_add_src_horiz_hip(). filter_y and y_step_q4 are accepted but not
// used.
void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h) {
  // Vertical arguments are unused in the horizontal-only path.
  (void)filter_y;
  (void)y_step_q4;

  const InterpKernel *const kernels = get_filter_base(filter_x);

  convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, kernels,
                             get_filter_offset(filter_x, kernels), x_step_q4,
                             w, h);
}

// Public vertical "add_src" entry point with 16-bit input (C version).
// Resolves filter_y into a kernel table plus a starting position through
// get_filter_base()/get_filter_offset() and runs convolve_add_src_vert_hip().
// filter_x and x_step_q4 are accepted but not used.
void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h) {
  // Horizontal arguments are unused in the vertical-only path.
  (void)filter_x;
  (void)x_step_q4;

  const InterpKernel *const kernels = get_filter_base(filter_y);

  convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, kernels,
                            get_filter_offset(filter_y, kernels), y_step_q4, w,
                            h);
}

// Public 2-D "add_src" entry point using the 16-bit intermediate path
// (C version). Resolves filter_x and filter_y into kernel tables plus
// starting positions through get_filter_base()/get_filter_offset() and runs
// convolve_add_src_hip().
void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  // Horizontal kernel table and starting position.
  const InterpKernel *const horiz_kernels = get_filter_base(filter_x);
  const int horiz_start_q4 = get_filter_offset(filter_x, horiz_kernels);
  // Vertical kernel table and starting position.
  const InterpKernel *const vert_kernels = get_filter_base(filter_y);
  const int vert_start_q4 = get_filter_offset(filter_y, vert_kernels);

  convolve_add_src_hip(src, src_stride, dst, dst_stride, horiz_kernels,
                       horiz_start_q4, x_step_q4, vert_kernels, vert_start_q4,
                       y_step_q4, w, h);
}
789 790
#endif  // CONFIG_LOOP_RESTORATION

791 792 793 794 795 796 797
// Dot product of the SUBPEL_TAPS coefficients in b with SUBPEL_TAPS
// consecutive 16-bit samples starting at a. Both operands are promoted to int
// before multiplying.
static INLINE int highbd_horz_scalar_product(const uint16_t *a,
                                             const int16_t *b) {
  int acc = 0;
  int tap = 0;
  while (tap < SUBPEL_TAPS) {
    acc += a[tap] * b[tap];
    ++tap;
  }
  return acc;
}

798 799
static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
800 801
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h, int bd) {
802 803 804
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
805
  for (int y = 0; y < h; ++y) {
806
    int x_q4 = x0_q4;
807
    for (int x = 0; x < w; ++x) {
808 809
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
810
      const int sum = highbd_horz_scalar_product(src_x, x_filter);
811
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
812 813 814 815 816 817 818
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

819 820
static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                      uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
821 822
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h, int bd) {
823 824 825
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
826
  for (int y = 0; y < h; ++y) {
827
    int x_q4 = x0_q4;
828
    for (int x = 0; x < w; ++x) {
829 830
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
831
      const int sum = highbd_horz_scalar_product(src_x, x_filter);
clang-format's avatar
clang-format committed
832 833 834
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
835 836 837 838 839 840 841
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

842 843
static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
844 845
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h, int bd) {
846 847 848
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
849
  for (int x = 0; x < w; ++x) {
850
    int y_q4 = y0_q4;
851
    for (int y = 0; y < h; ++y) {
852 853
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
854
      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
clang-format's avatar
clang-format committed
855 856
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
857 858 859 860 861 862 863
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

864 865
static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                     uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
866 867
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h, int bd) {
868 869 870
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
871
  for (int x = 0; x < w; ++x) {
872
    int y_q4 = y0_q4;
873
    for (int y = 0; y < h; ++y) {
874 875
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
876
      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
clang-format's avatar
clang-format committed
877 878 879 880
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
881 882 883 884 885 886 887
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

888 889
static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
890 891 892
                            const InterpKernel *const x_filters, int x0_q4,
                            int x_step_q4, const InterpKernel *const y_filters,
                            int y0_q4, int y_step_q4, int w, int h, int bd) {
893 894 895 896 897 898 899 900 901 902 903 904
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
905
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
906
  const int intermediate_height =
clang-format's avatar
clang-format committed
907
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
908

909 910
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
911 912 913
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

914
  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
clang-format's avatar
clang-format committed
915 916
                        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
                        x_step_q4, w, intermediate_height, bd);
917
  highbd_convolve_vert(
clang-format's avatar
clang-format committed
918 919
      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
920 921
}

Yaowu Xu's avatar
Yaowu Xu committed
922
void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
923 924
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
925 926
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h, int bd) {
927 928 929 930 931
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
932 933
  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                        x_step_q4, w, h, bd);
934 935
}

Yaowu Xu's avatar
Yaowu Xu committed
936
void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
937 938 939 940
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h, int bd) {
941 942 943 944 945
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
946 947
  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                            x_step_q4, w, h, bd);
948 949
}

Yaowu Xu's avatar
Yaowu Xu committed
950
void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
951 952
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
953 954
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h, int bd) {
955 956 957 958 959
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
960 961
  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                       y_step_q4, w, h, bd);
962 963
}

Yaowu Xu's avatar
Yaowu Xu committed
964
void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
965 966 967 968
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h, int bd) {
969 970 971 972 973
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
974 975
  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                           y_step_q4, w, h, bd);
976 977
}

Yaowu Xu's avatar
Yaowu Xu committed
978
void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
979 980
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
981 982
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h, int bd) {
983 984 985 986 987 988
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

clang-format's avatar
clang-format committed
989
  highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
990
                  filters_y, y0_q4, y_step_q4, w, h, bd);
991 992
}

Yaowu Xu's avatar
Yaowu Xu committed
993
void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
994 995
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
996 997
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h, int bd) {
998
  // Fixed size intermediate buffer places limits on parameters.
999 1000 1001
  DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
1002

Yaowu Xu's avatar
Yaowu Xu committed
1003
  aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
1004
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
Yaowu Xu's avatar
Yaowu Xu committed
1005
  aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
clang-format's avatar
clang-format committed
1006
                            dst_stride, NULL, 0, NULL, 0, w, h, bd);