/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
Christian Duvivier's avatar
Christian Duvivier committed
11

John Koleszar's avatar
John Koleszar committed
12
#include <assert.h>
Zoe Liu's avatar
Zoe Liu committed
13
#include <string.h>
John Koleszar's avatar
John Koleszar committed
14

15 16 17 18 19 20
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
21
#include "aom_ports/mem.h"
John Koleszar's avatar
John Koleszar committed
22

23 24 25 26 27 28 29 30 31 32 33 34 35
// Returns the dot product of SUBPEL_TAPS consecutive pixels starting at |a|
// with the SUBPEL_TAPS filter coefficients in |b|.
static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
  int acc = 0;
  int tap = 0;
  while (tap < SUBPEL_TAPS) {
    acc += a[tap] * b[tap];
    ++tap;
  }
  return acc;
}

// Returns the dot product of SUBPEL_TAPS pixels taken every |a_stride|
// elements starting at |a| (i.e. one column) with the coefficients in |b|.
static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
                                      const int16_t *b) {
  int acc = 0;
  int tap = 0;
  while (tap < SUBPEL_TAPS) {
    const uint8_t pixel = a[tap * a_stride];
    acc += pixel * b[tap];
    ++tap;
  }
  return acc;
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
36 37
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
38 39
                           const InterpKernel *x_filters, int x0_q4,
                           int x_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
40
  src -= SUBPEL_TAPS / 2 - 1;
41
  for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
42
    int x_q4 = x0_q4;
43
    for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
44 45
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
46
      const int sum = horz_scalar_product(src_x, x_filter);
47
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
John Koleszar's avatar
John Koleszar committed
48 49 50 51 52 53 54
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

55 56 57 58 59
static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_qn,
                                   int x_step_qn, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
60
  for (int y = 0; y < h; ++y) {
61
    int x_qn = x0_qn;
62
    for (int x = 0; x < w; ++x) {
63 64 65 66
      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];  // q8
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter = x_filters[x_filter_idx];
67
      const int sum = horz_scalar_product(src_x, x_filter);
68 69 70 71 72 73 74 75
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_qn += x_step_qn;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
76 77
static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
78 79
                               const InterpKernel *x_filters, int x0_q4,
                               int x_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
80
  src -= SUBPEL_TAPS / 2 - 1;
81
  for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
82
    int x_q4 = x0_q4;
83
    for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
84 85
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
86
      const int sum = horz_scalar_product(src_x, x_filter);
clang-format's avatar
clang-format committed
87 88
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
John Koleszar's avatar
John Koleszar committed
89 90 91 92 93 94 95
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

96 97 98 99 100
static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_qn,
                                       int x_step_qn, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
101
  for (int y = 0; y < h; ++y) {
102
    int x_qn = x0_qn;
103
    for (int x = 0; x < w; ++x) {
104 105 106 107
      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter = x_filters[x_filter_idx];
108
      const int sum = horz_scalar_product(src_x, x_filter);
109 110 111 112 113 114 115 116 117
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
      x_qn += x_step_qn;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
118 119
static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
120 121
                          const InterpKernel *y_filters, int y0_q4,
                          int y_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
122
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
John Koleszar's avatar
John Koleszar committed
123

124
  for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
125
    int y_q4 = y0_q4;
126
    for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
127 128
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
129
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
130
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
John Koleszar's avatar
John Koleszar committed
131 132 133 134 135 136 137
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

138 139 140 141 142 143
static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_qn,
                                  int y_step_qn, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

144
  for (int x = 0; x < w; ++x) {
145
    int y_qn = y0_qn;
146
    for (int y = 0; y < h; ++y) {
147 148 149 150
      const unsigned char *src_y =
          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter =
          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
151
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
152 153 154 155 156 157 158 159
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      y_qn += y_step_qn;
    }
    ++src;
    ++dst;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
160 161
static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
162 163
                              const InterpKernel *y_filters, int y0_q4,
                              int y_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
164
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
John Koleszar's avatar
John Koleszar committed
165

166
  for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
167
    int y_q4 = y0_q4;
168
    for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
169 170
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
171
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
clang-format's avatar
clang-format committed
172 173 174 175
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
John Koleszar's avatar
John Koleszar committed
176 177 178 179 180 181 182
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

183 184 185 186 187 188
static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_qn,
                                      int y_step_qn, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

189
  for (int x = 0; x < w; ++x) {
190
    int y_qn = y0_qn;
191
    for (int y = 0; y < h; ++y) {
192 193 194 195
      const unsigned char *src_y =
          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter =
          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
196
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
197 198 199 200 201 202 203 204 205 206 207
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
      y_qn += y_step_qn;
    }
    ++src;
    ++dst;
  }
}

clang-format's avatar
clang-format committed
208 209
static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,
Dmitry Kovalev's avatar
Dmitry Kovalev committed
210
                     int x0_q4, int x_step_q4,
clang-format's avatar
clang-format committed
211 212
                     const InterpKernel *const y_filters, int y0_q4,
                     int y_step_q4, int w, int h) {
213 214 215 216 217 218 219 220 221 222 223 224
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
225
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
226
  const int intermediate_height =
clang-format's avatar
clang-format committed
227
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
228

229 230
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
231

232 233
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);
234

clang-format's avatar
clang-format committed
235 236 237 238 239
  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
                 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                 intermediate_height);
  convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
                dst_stride, y_filters, y0_q4, y_step_q4, w, h);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
240 241
}

242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
// Separable 2-D sub-pixel filter for the higher-precision ("qn") scaling
// path. Positions and steps carry SCALE_SUBPEL_BITS fractional bits.
//   (1) filter horizontally into the stack buffer |temp|, producing enough
//       rows to cover the vertical filter's reach;
//   (2) filter |temp| vertically into |dst|.
//
// |temp| has a fixed size (MAX_EXT_SIZE rows of MAX_SB_SIZE pixels), which
// limits the parameters: w and h may not exceed MAX_SB_SIZE, and each step
// may not exceed two source pixels per output pixel (x1/2 downscale), i.e.
// 2 << SCALE_SUBPEL_BITS in qn units. This mirrors the `<= 32` limit that
// convolve() applies in 1/16th-pixel units.
static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters, int x0_qn,
                             int x_step_qn, const InterpKernel *const y_filters,
                             int y0_qn, int y_step_qn, int w, int h) {
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  // Source rows spanned by the last output row, plus the filter taps.
  const int intermediate_height =
      (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  // The bound is a step size in qn units (two whole pixels), not a bit count.
  assert(y_step_qn <= (2 << SCALE_SUBPEL_BITS));
  assert(x_step_qn <= (2 << SCALE_SUBPEL_BITS));

  convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
                         temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
                         intermediate_height);
  convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
                        dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
}

277
// Recovers the start of the kernel table that |filter| points into by
// clearing the low 8 bits of its address.
// NOTE: This assumes that the filter table is 256-byte aligned.
// TODO(agrange) Modify to make independent of table alignment.
static const InterpKernel *get_filter_base(const int16_t *filter) {
  const intptr_t low_bits = (intptr_t)0xFF;
  const intptr_t address = (intptr_t)filter;
  return (const InterpKernel *)(address & ~low_bits);
}

283
// Returns the index of kernel |f| within the kernel table starting at |base|.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}

287
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
288
                           uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
289
                           const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
290 291
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
292
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
293 294
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

295 296 297
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
298 299
  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
                 w, h);
John Koleszar's avatar
John Koleszar committed
300 301
}

302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
// Horizontal-only convolution for the qn scaling path. The kernel table is
// the base of |filter_x|; the starting position is passed explicitly as
// |subpel_x|. The vertical arguments are ignored.
void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int subpel_x,
                                 int x_step_qn, const int16_t *filter_y,
                                 int subpel_y, int y_step_qn, int w, int h) {
  (void)filter_y;
  (void)subpel_y;
  (void)y_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_x);

  convolve_horiz_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                         subpel_x, x_step_qn, w, h);
}

317
void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
318
                               uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
319
                               const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
320 321
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h) {
322
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
323 324
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

325 326 327
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
328 329
  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                     x_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
330 331
}

332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347
// Horizontal-only convolution for the qn scaling path, averaged into |dst|.
// The kernel table is the base of |filter_x|; the starting position is
// |subpel_x|. The vertical arguments are ignored.
void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int subpel_x,
                                     int x_step_qn, const int16_t *filter_y,
                                     int subpel_y, int y_step_qn, int w,
                                     int h) {
  (void)filter_y;
  (void)subpel_y;
  (void)y_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_x);

  convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                             subpel_x, x_step_qn, w, h);
}

348
void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
349
                          uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
350
                          const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
351 352
                          const int16_t *filter_y, int y_step_q4, int w,
                          int h) {
353
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
354
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
355 356 357 358

  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
359 360
  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
                w, h);
John Koleszar's avatar
John Koleszar committed
361 362
}

363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
// Vertical-only convolution for the qn scaling path. The kernel table is the
// base of |filter_y|; the starting position is passed explicitly as
// |subpel_y|. The horizontal arguments are ignored.
void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int subpel_x,
                                int x_step_qn, const int16_t *filter_y,
                                int subpel_y, int y_step_qn, int w, int h) {
  (void)filter_x;
  (void)subpel_x;
  (void)x_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_y);

  convolve_vert_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                        subpel_y, y_step_qn, w, h);
}

378
void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
379
                              uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
380
                              const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
381 382
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
383
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
384
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
385 386 387 388

  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
389 390
  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                    y_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
391 392
}

393 394 395 396 397 398 399 400 401 402 403 404 405 406 407
// Vertical-only convolution for the qn scaling path, averaged into |dst|.
// The kernel table is the base of |filter_y|; the starting position is
// |subpel_y|. The horizontal arguments are ignored.
void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int subpel_x,
                                    int x_step_qn, const int16_t *filter_y,
                                    int subpel_y, int y_step_qn, int w, int h) {
  (void)filter_x;
  (void)subpel_x;
  (void)x_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_y);

  convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                            subpel_y, y_step_qn, w, h);
}

408
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
clang-format's avatar
clang-format committed
409 410
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
John Koleszar's avatar
John Koleszar committed
411
                     int w, int h) {
412
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
413 414
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

415
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
416 417
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

clang-format's avatar
clang-format committed
418
  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
Dmitry Kovalev's avatar
Dmitry Kovalev committed
419
           filters_y, y0_q4, y_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
420 421
}

422 423 424 425 426 427 428 429 430 431 432 433 434
// 2-D convolution for the qn scaling path. The kernel tables are the bases
// of |filter_x| and |filter_y|; the starting positions are passed explicitly
// as |subpel_x| and |subpel_y|.
void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int subpel_x, int x_step_qn,
                           const int16_t *filter_y, int subpel_y, int y_step_qn,
                           int w, int h) {
  const InterpKernel *const horiz_table = get_filter_base(filter_x);
  const InterpKernel *const vert_table = get_filter_base(filter_y);

  convolve_scale_c(src, src_stride, dst, dst_stride, horiz_table, subpel_x,
                   x_step_qn, vert_table, subpel_y, y_step_qn, w, h);
}

435
void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
clang-format's avatar
clang-format committed
436 437
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
John Koleszar's avatar
John Koleszar committed
438
                         int w, int h) {
Christian Duvivier's avatar
Christian Duvivier committed
439
  /* Fixed size intermediate buffer places limits on parameters. */
440 441 442
  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
Christian Duvivier's avatar
Christian Duvivier committed
443

444
  aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
clang-format's avatar
clang-format committed
445
                  filter_y, y_step_q4, w, h);
446
  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
clang-format's avatar
clang-format committed
447
                     h);
John Koleszar's avatar
John Koleszar committed
448
}
449

450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465
// 2-D convolution for the qn scaling path, averaged into |dst|: the filtered
// block is first written to a scratch buffer and then averaged with the
// existing |dst| contents by aom_convolve_avg_c().
void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int subpel_x,
                               int x_step_qn, const int16_t *filter_y,
                               int subpel_y, int y_step_qn, int w, int h) {
  // The scratch buffer has a fixed size, so the block may be at most
  // MAX_SB_SIZE x MAX_SB_SIZE.
  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  // Step 1: filter into the scratch buffer (stride MAX_SB_SIZE).
  aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
                        x_step_qn, filter_y, subpel_y, y_step_qn, w, h);
  // Step 2: average the scratch buffer into dst; filter arguments are unused.
  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
                     h);
}

466
// Copies a w x h block of bytes from |src| to |dst| row by row. The filter
// arguments exist only to match the common convolve signature and are
// ignored.
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  const uint8_t *src_row = src;
  uint8_t *dst_row = dst;
  for (int row = 0; row < h; ++row) {
    memcpy(dst_row, src_row, w);
    src_row += src_stride;
    dst_row += dst_stride;
  }
}

484
// Replaces every pixel of the w x h block in |dst| with the rounded average
// of itself and the co-located pixel in |src|. The filter arguments exist
// only to match the common convolve signature and are ignored.
void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  const uint8_t *src_row = src;
  uint8_t *dst_row = dst;
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      dst_row[col] = ROUND_POWER_OF_TWO(dst_row[col] + src_row[col], 1);
    }
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
500

501
// Scaled horizontal prediction: forwards every argument unchanged to
// aom_convolve8_horiz_c().
void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4,
                        filter_y, y_step_q4,
                        w, h);
}

509
// Scaled vertical prediction: forwards every argument unchanged to
// aom_convolve8_vert_c().
void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  aom_convolve8_vert_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, h);
}

517
// Scaled 2-D prediction: forwards every argument unchanged to
// aom_convolve8_c().
void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  aom_convolve8_c(src, src_stride, dst, dst_stride,
                  filter_x, x_step_q4,
                  filter_y, y_step_q4,
                  w, h);
}

525
// Scaled, averaged horizontal prediction: forwards every argument unchanged
// to aom_convolve8_avg_horiz_c().
void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
}

534
// Scaled, averaged vertical prediction: forwards every argument unchanged to
// aom_convolve8_avg_vert_c().
void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
}

543
// Scaled, averaged 2-D prediction: forwards every argument unchanged to
// aom_convolve8_avg_c().
void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  aom_convolve8_avg_c(src, src_stride, dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
}

551 552 553 554 555 556 557 558
// 16-bit-sample counterpart of vert_scalar_product(): dot product of
// SUBPEL_TAPS samples taken every |a_stride| elements starting at |a| with
// the coefficients in |b|.
static INLINE int highbd_vert_scalar_product(const uint16_t *a,
                                             ptrdiff_t a_stride,
                                             const int16_t *b) {
  int acc = 0;
  int tap = 0;
  while (tap < SUBPEL_TAPS) {
    const uint16_t sample = a[tap * a_stride];
    acc += sample * b[tap];
    ++tap;
  }
  return acc;
}

559
// TODO(afergs): Make sure this works too
560 561 562 563 564 565
#if CONFIG_LOOP_RESTORATION
static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_q4,
                                   int x_step_q4, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
566
  for (int y = 0; y < h; ++y) {
567
    int x_q4 = x0_q4;
568
    for (int x = 0; x < w; ++x) {
569 570
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
571 572

      const int sum = horz_scalar_product(src_x, x_filter);
573 574 575 576 577 578 579 580 581 582 583 584 585 586 587
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                          src_x[SUBPEL_TAPS / 2 - 1]);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_q4,
                                  int y_step_q4, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

588
  for (int x = 0; x < w; ++x) {
589
    int y_q4 = y0_q4;
590
    for (int y = 0; y < h; ++y) {
591 592
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
593
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609
      dst[y * dst_stride] =
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                     src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters, int x0_q4,
                             int x_step_q4, const InterpKernel *const y_filters,
                             int y0_q4, int y_step_q4, int w, int h) {
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
610
  const int intermediate_height =
611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
                         temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                         intermediate_height);
  convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
                        dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
}

// C entry point for the horizontal-only "add source" convolution.
//
// The kernel table and the starting q4 offset are derived from filter_x via
// get_filter_base() and get_filter_offset().  filter_y and y_step_q4 are part
// of the signature but are not used.
void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  (void)filter_y;
  (void)y_step_q4;

  const InterpKernel *const kernel_table = get_filter_base(filter_x);
  const int start_q4 = get_filter_offset(filter_x, kernel_table);

  convolve_add_src_horiz(src, src_stride, dst, dst_stride, kernel_table,
                         start_q4, x_step_q4, w, h);
}

// C entry point for the vertical-only "add source" convolution.
//
// The kernel table and the starting q4 offset are derived from filter_y via
// get_filter_base() and get_filter_offset().  filter_x and x_step_q4 are part
// of the signature but are not used.
void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  (void)filter_x;
  (void)x_step_q4;

  const InterpKernel *const kernel_table = get_filter_base(filter_y);
  const int start_q4 = get_filter_offset(filter_y, kernel_table);

  convolve_add_src_vert(src, src_stride, dst, dst_stride, kernel_table,
                        start_q4, y_step_q4, w, h);
}

// C entry point for the 2-D "add source" convolution.
//
// For each direction the kernel table and starting q4 offset are derived from
// the supplied filter pointer, then the two-pass helper does the filtering.
void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  const InterpKernel *const horiz_table = get_filter_base(filter_x);
  const int horiz_start_q4 = get_filter_offset(filter_x, horiz_table);

  const InterpKernel *const vert_table = get_filter_base(filter_y);
  const int vert_start_q4 = get_filter_offset(filter_y, vert_table);

  convolve_add_src(src, src_stride, dst, dst_stride, horiz_table,
                   horiz_start_q4, x_step_q4, vert_table, vert_start_q4,
                   y_step_q4, w, h);
}
670 671 672 673 674

static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_q4,
                                       int x_step_q4, int w, int h) {
675
  const int bd = 8;
676
  src -= SUBPEL_TAPS / 2 - 1;
677
  for (int y = 0; y < h; ++y) {
678
    int x_q4 = x0_q4;
679
    for (int x = 0; x < w; ++x) {
680 681
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
682 683 684
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
                           (1 << (bd + FILTER_BITS - 1));
      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
685 686
      dst[x] =
          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
687
                          0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
688 689 690 691 692 693 694 695 696 697 698
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_q4,
                                      int y_step_q4, int w, int h) {
699
  const int bd = 8;
700 701
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

702
  for (int x = 0; x < w; ++x) {
703
    int y_q4 = y0_q4;
704
    for (int y = 0; y < h; ++y) {
705 706
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
707
      const int rounding =
708 709
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
710 711
      const int sum =
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727
      dst[y * dst_stride] =
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *const x_filters, int x0_q4,
                                 int x_step_q4,
                                 const InterpKernel *const y_filters, int y0_q4,
                                 int y_step_q4, int w, int h) {
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
728
  const int intermediate_height =
729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                             src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
                             x_step_q4, w, intermediate_height);
  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
                            y_step_q4, w, h);
}

// C entry point for the horizontal "add source" pass with 16-bit output.
//
// The kernel table and the starting q4 offset are derived from filter_x via
// get_filter_base() and get_filter_offset().  filter_y and y_step_q4 are part
// of the signature but are not used.
void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h) {
  (void)filter_y;
  (void)y_step_q4;

  const InterpKernel *const kernel_table = get_filter_base(filter_x);
  const int start_q4 = get_filter_offset(filter_x, kernel_table);

  convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, kernel_table,
                             start_q4, x_step_q4, w, h);
}

// C entry point for the vertical "add source" pass with 16-bit input.
//
// The kernel table and the starting q4 offset are derived from filter_y via
// get_filter_base() and get_filter_offset().  filter_x and x_step_q4 are part
// of the signature but are not used.
void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h) {
  (void)filter_x;
  (void)x_step_q4;

  const InterpKernel *const kernel_table = get_filter_base(filter_y);
  const int start_q4 = get_filter_offset(filter_y, kernel_table);

  convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, kernel_table,
                            start_q4, y_step_q4, w, h);
}

// C entry point for the 2-D "add source" convolution that uses a 16-bit
// intermediate buffer.
//
// For each direction the kernel table and starting q4 offset are derived from
// the supplied filter pointer, then the two-pass helper does the filtering.
void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  const InterpKernel *const horiz_table = get_filter_base(filter_x);
  const int horiz_start_q4 = get_filter_offset(filter_x, horiz_table);

  const InterpKernel *const vert_table = get_filter_base(filter_y);
  const int vert_start_q4 = get_filter_offset(filter_y, vert_table);

  convolve_add_src_hip(src, src_stride, dst, dst_stride, horiz_table,
                       horiz_start_q4, x_step_q4, vert_table, vert_start_q4,
                       y_step_q4, w, h);
}
789 790
#endif  // CONFIG_LOOP_RESTORATION

791 792 793 794 795 796 797
// Returns the dot product of SUBPEL_TAPS consecutive 16-bit samples starting
// at `a` with the SUBPEL_TAPS filter coefficients starting at `b`.
static INLINE int highbd_horz_scalar_product(const uint16_t *a,
                                             const int16_t *b) {
  int acc = 0;
  int tap = 0;
  while (tap < SUBPEL_TAPS) {
    acc += a[tap] * b[tap];
    ++tap;
  }
  return acc;
}

798 799
static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
800 801
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h, int bd) {
802 803 804
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
805
  for (int y = 0; y < h; ++y) {
806
    int x_q4 = x0_q4;
807
    for (int x = 0; x < w; ++x) {
808 809
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
810
      const int sum = highbd_horz_scalar_product(src_x, x_filter);
811
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
812 813 814 815 816 817 818
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

819 820
static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                      uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
821 822
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h, int bd) {
823 824 825
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
826
  for (int y = 0; y < h; ++y) {
827
    int x_q4 = x0_q4;
828
    for (int x = 0; x < w; ++x) {
829 830
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
831
      const int sum = highbd_horz_scalar_product(src_x, x_filter);
clang-format's avatar
clang-format committed
832 833 834
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
835 836 837 838 839 840 841
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

842 843
static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
844 845
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h, int bd) {
846 847 848
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
849
  for (int x = 0; x < w; ++x) {
850
    int y_q4 = y0_q4;
851
    for (int y = 0; y < h; ++y) {
852 853
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
854
      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
clang-format's avatar
clang-format committed
855 856
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
857 858 859 860 861 862 863
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

864 865
static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                     uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
866 867
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h, int bd) {
868 869 870
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
871
  for (int x = 0; x < w; ++x) {
872
    int y_q4 = y0_q4;
873
    for (int y = 0; y < h; ++y) {
874 875
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
876
      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
clang-format's avatar
clang-format committed
877 878 879 880
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
881 882 883 884 885 886 887
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

888 889
static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
890 891 892
                            const InterpKernel *const x_filters, int x0_q4,
                            int x_step_q4, const InterpKernel *const y_filters,
                            int y0_q4, int y_step_q4, int w, int h, int bd) {
893 894 895 896 897 898 899 900 901 902 903 904
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
905
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
906
  const int intermediate_height =
clang-format's avatar
clang-format committed
907
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
908

909 910
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
911 912 913
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

914
  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
clang-format's avatar
clang-format committed
915 916
                        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
                        x_step_q4, w, intermediate_height, bd);
917
  highbd_convolve_vert(
clang-format's avatar
clang-format committed
918 919
      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
920 921
}

922
void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
923 924
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
925 926
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h, int bd) {
927 928 929 930 931
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
932 933
  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                        x_step_q4, w, h, bd);
934 935
}

936
void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
937 938 939 940
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h, int bd) {
941 942 943 944 945
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
946 947
  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                            x_step_q4, w, h, bd);
948 949
}

950
void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
951 952
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
953 954
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h, int bd) {
955 956 957 958 959
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
960 961
  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                       y_step_q4, w, h, bd);
962 963
}

964
void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
965 966 967 968
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h, int bd) {
969 970 971 972 973
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
974 975
  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                           y_step_q4, w, h, bd);
976 977
}

978
void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
979 980
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
981 982
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h, int bd) {
983 984 985 986 987 988
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

clang-format's avatar
clang-format committed
989
  highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
990
                  filters_y, y0_q4, y_step_q4, w, h, bd);
991 992
}

993
void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
994 995
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
996 997
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h, int bd) {
998
  // Fixed size intermediate buffer places limits on parameters.
999 1000 1001
  DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
1002

1003
  aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
1004
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
1005
  aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
clang-format's avatar
clang-format committed
1006
                            dst_stride, NULL, 0, NULL, 0, w, h, bd);
1007 1008
}

1009
// Copies a w x h block of 16-bit samples from src8 to dst8, one row at a
// time with memcpy().
//
// src8 and dst8 are converted to uint16_t pointers with
// CONVERT_TO_SHORTPTR().  The filter arguments and bd are part of the
// signature but are not used.
void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst_row = CONVERT_TO_SHORTPTR(dst8);
  const size_t row_bytes = w * sizeof(uint16_t);

  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (int row = 0; row < h; ++row) {
    memcpy(dst_row, src_row, row_bytes);
    src_row += src_stride;
    dst_row += dst_stride;
  }
}

1030
// Averages a w x h block of 16-bit samples from src8 into dst8:
// each output is ROUND_POWER_OF_TWO(dst + src, 1).
//
// src8 and dst8 are converted to uint16_t pointers with
// CONVERT_TO_SHORTPTR().  The filter arguments and bd are part of the
// signature but are not used.
void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst_row = CONVERT_TO_SHORTPTR(dst8);

  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      dst_row[col] = ROUND_POWER_OF_TWO(dst_row[col] + src_row[col], 1);
    }
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061

#if CONFIG_LOOP_RESTORATION
static void highbd_convolve_add_src_horiz(const uint8_t *src8,
                                          ptrdiff_t src_stride, uint8_t *dst8,
                                          ptrdiff_t dst_stride,
                                          const InterpKernel *x_filters,
                                          int x0_q4, int x_step_q4, int w,
                                          int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
1062
  for (int y = 0; y < h; ++y) {
1063
    int x_q4 = x0_q4;
1064
    for (int x = 0; x < w; ++x) {
1065 1066
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1067
      const int sum = highbd_horz_scalar_product(src_x, x_filter);
1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086
      dst[x] = clip_pixel_highbd(
          ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
          bd);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void highbd_convolve_add_src_vert(const uint8_t *src8,
                                         ptrdiff_t src_stride, uint8_t *dst8,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *y_filters,
                                         int y0_q4, int y_step_q4, int w, int h,
                                         int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1087
  for (int x = 0; x < w; ++x) {
1088
    int y_q4 = y0_q4;
1089
    for (int y = 0; y < h; ++y) {
1090 1091
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1092
      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                                src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
                            bd);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    int x0_q4, int x_step_q4,
                                    const InterpKernel *const y_filters,
                                    int y0_q4, int y_step_q4, int w, int h,
                                    int bd) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
1124
  const int intermediate_height =
1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                                src_stride, CONVERT_TO_BYTEPTR(temp),
                                MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                                intermediate_height, bd);
  highbd_convolve_add_src_vert(
      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
}

// C entry point for the 2-D "add source" convolution on high-bit-depth
// buffers.
//
// For each direction the kernel table and starting q4 offset are derived from
// the supplied filter pointer, then the two-pass helper does the filtering.
void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h, int bd) {
  const InterpKernel *const horiz_table = get_filter_base(filter_x);
  const int horiz_start_q4 = get_filter_offset(filter_x, horiz_table);

  const InterpKernel *const vert_table = get_filter_base(filter_y);
  const int vert_start_q4 = get_filter_offset(filter_y, vert_table);

  highbd_convolve_add_src(src, src_stride, dst, dst_stride, horiz_table,
                          horiz_start_q4, x_step_q4, vert_table, vert_start_q4,
                          y_step_q4, w, h, bd);
}
1155 1156 1157 1158 1159

static void highbd_convolve_add_src_horiz_hip(
    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
    int x_step_q4, int w, int h, int bd) {
1160
  const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
1161 1162
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  src -= SUBPEL_TAPS / 2 - 1;
1163
  for (int y = 0; y < h; ++y) {
1164
    int x_q4 = x0_q4;
1165
    for (int x = 0; x < w; ++x) {
1166 1167
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1168 1169