/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
Christian Duvivier's avatar
Christian Duvivier committed
11

John Koleszar's avatar
John Koleszar committed
12
#include <assert.h>
Zoe Liu's avatar
Zoe Liu committed
13
#include <string.h>
John Koleszar's avatar
John Koleszar committed
14

Yaowu Xu's avatar
Yaowu Xu committed
15 16 17 18 19 20
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
21
#include "aom_ports/mem.h"
John Koleszar's avatar
John Koleszar committed
22

23 24 25 26 27 28 29 30 31 32 33 34 35
// Dot product of SUBPEL_TAPS consecutive pixels starting at `a` with the
// SUBPEL_TAPS filter coefficients starting at `b`.
static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
  int acc = 0;
  int tap = 0;
  while (tap < SUBPEL_TAPS) {
    acc += a[tap] * b[tap];
    ++tap;
  }
  return acc;
}

// Dot product of SUBPEL_TAPS pixels taken from `a` at intervals of `a_stride`
// elements (i.e. one per row) with the SUBPEL_TAPS coefficients at `b`.
static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
                                      const int16_t *b) {
  int acc = 0;
  ptrdiff_t row_offset = 0;  // Always equals tap * a_stride.
  for (int tap = 0; tap < SUBPEL_TAPS; ++tap) {
    acc += a[row_offset] * b[tap];
    row_offset += a_stride;
  }
  return acc;
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
36 37
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
38 39
                           const InterpKernel *x_filters, int x0_q4,
                           int x_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
40
  src -= SUBPEL_TAPS / 2 - 1;
41
  for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
42
    int x_q4 = x0_q4;
43
    for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
44 45
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
46
      const int sum = horz_scalar_product(src_x, x_filter);
47
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
John Koleszar's avatar
John Koleszar committed
48 49 50 51 52 53 54
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
55 56 57 58 59
static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_qn,
                                   int x_step_qn, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
60
  for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
61
    int x_qn = x0_qn;
62
    for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
63 64 65 66
      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];  // q8
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter = x_filters[x_filter_idx];
67
      const int sum = horz_scalar_product(src_x, x_filter);
Fergus Simpson's avatar
Fergus Simpson committed
68 69 70 71 72 73 74 75
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_qn += x_step_qn;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
76 77
static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
78 79
                               const InterpKernel *x_filters, int x0_q4,
                               int x_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
80
  src -= SUBPEL_TAPS / 2 - 1;
81
  for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
82
    int x_q4 = x0_q4;
83
    for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
84 85
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
86
      const int sum = horz_scalar_product(src_x, x_filter);
clang-format's avatar
clang-format committed
87 88
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
John Koleszar's avatar
John Koleszar committed
89 90 91 92 93 94 95
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
96 97 98 99 100
static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_qn,
                                       int x_step_qn, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
101
  for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
102
    int x_qn = x0_qn;
103
    for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
104 105 106 107
      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter = x_filters[x_filter_idx];
108
      const int sum = horz_scalar_product(src_x, x_filter);
Fergus Simpson's avatar
Fergus Simpson committed
109 110 111 112 113 114 115 116 117
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
      x_qn += x_step_qn;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
118 119
static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
120 121
                          const InterpKernel *y_filters, int y0_q4,
                          int y_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
122
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
John Koleszar's avatar
John Koleszar committed
123

124
  for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
125
    int y_q4 = y0_q4;
126
    for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
127 128
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
129
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
130
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
John Koleszar's avatar
John Koleszar committed
131 132 133 134 135 136 137
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
138 139 140 141 142 143
static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_qn,
                                  int y_step_qn, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

144
  for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
145
    int y_qn = y0_qn;
146
    for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
147 148 149 150
      const unsigned char *src_y =
          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter =
          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
151
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
Fergus Simpson's avatar
Fergus Simpson committed
152 153 154 155 156 157 158 159
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      y_qn += y_step_qn;
    }
    ++src;
    ++dst;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
160 161
static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
162 163
                              const InterpKernel *y_filters, int y0_q4,
                              int y_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
164
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
John Koleszar's avatar
John Koleszar committed
165

166
  for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
167
    int y_q4 = y0_q4;
168
    for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
169 170
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
171
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
clang-format's avatar
clang-format committed
172 173 174 175
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
John Koleszar's avatar
John Koleszar committed
176 177 178 179 180 181 182
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
183 184 185 186 187 188
static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_qn,
                                      int y_step_qn, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

189
  for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
190
    int y_qn = y0_qn;
191
    for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
192 193 194 195
      const unsigned char *src_y =
          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter =
          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
196
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
Fergus Simpson's avatar
Fergus Simpson committed
197 198 199 200 201 202 203 204 205 206 207
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
      y_qn += y_step_qn;
    }
    ++src;
    ++dst;
  }
}

clang-format's avatar
clang-format committed
208 209
static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,
Dmitry Kovalev's avatar
Dmitry Kovalev committed
210
                     int x0_q4, int x_step_q4,
clang-format's avatar
clang-format committed
211 212
                     const InterpKernel *const y_filters, int y0_q4,
                     int y_step_q4, int w, int h) {
213 214 215 216 217 218 219 220 221 222 223 224
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
225
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
226
  int intermediate_height =
clang-format's avatar
clang-format committed
227
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
228

229 230
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
231

232 233
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);
234

clang-format's avatar
clang-format committed
235 236 237 238 239
  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
                 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                 intermediate_height);
  convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
                dst_stride, y_filters, y0_q4, y_step_q4, w, h);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
240 241
}

Fergus Simpson's avatar
Fergus Simpson committed
242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
// Two-pass 2-D sub-pixel convolution of a w x h block, with positions and
// steps kept at SCALE_SUBPEL_BITS bits of fractional precision (one whole
// pixel is 1 << SCALE_SUBPEL_BITS).
static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters, int x0_qn,
                             int x_step_qn, const InterpKernel *const y_filters,
                             int y0_qn, int y_step_qn, int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Number of rows needed in temp:
  // --h output rows span (h - 1) * y_step_qn in the source frame, in units of
  //   1 / (1 << SCALE_SUBPEL_BITS) pixel.
  // --y0_qn is added because the block may start at a sub-pixel position.
  // --An additional SUBPEL_TAPS rows are required for the filter tails.
  // temp is laid out with a row stride of MAX_SB_SIZE, so it can hold at most
  // MAX_EXT_SIZE rows.
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  const int intermediate_height =
      (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  // Steps are limited to two whole pixels per output pixel, matching the
  // "<= 32" bound of the 1/16-pel variant. The bound must be expressed in qn
  // units (2 << SCALE_SUBPEL_BITS), not as a multiple of the bit count.
  assert(y_step_qn <= (2 << SCALE_SUBPEL_BITS));
  assert(x_step_qn <= (2 << SCALE_SUBPEL_BITS));

  // The horizontal pass must not write past the end of temp.
  assert(intermediate_height <= MAX_EXT_SIZE);

  // Pass 1: start SUBPEL_TAPS / 2 - 1 rows above the block so the vertical
  // filter has its leading rows available in temp.
  convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
                         temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
                         intermediate_height);
  // Pass 2: skip those leading rows again; the vertical pass steps back over
  // them itself.
  convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
                        dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
}

277
// Returns the address of `filter` with its low 8 bits cleared, interpreted as
// the start of the kernel table that contains `filter`.
static const InterpKernel *get_filter_base(const int16_t *filter) {
  // NOTE: This assumes that the filter table is 256-byte aligned.
  // TODO(agrange) Modify to make independent of table alignment.
  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
}

283
// Returns the index of kernel `f` within the kernel table starting at `base`,
// i.e. the pointer difference measured in whole InterpKernel entries.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  return (int)((const InterpKernel *)(intptr_t)f - base);
}

Yaowu Xu's avatar
Yaowu Xu committed
287
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
288
                           uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
289
                           const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
290 291
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
292
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
293 294
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

295 296 297
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
298 299
  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
                 w, h);
John Koleszar's avatar
John Koleszar committed
300 301
}

Fergus Simpson's avatar
Fergus Simpson committed
302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
// Horizontal-only scaled filter of a w x h block. subpel_x is passed through
// as the initial position and x_step_qn as the per-pixel step; the kernel
// table is recovered from filter_x. The vertical parameters are unused.
void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int subpel_x,
                                 int x_step_qn, const int16_t *filter_y,
                                 int subpel_y, int y_step_qn, int w, int h) {
  // Present only to match the common convolve signature.
  (void)filter_y;
  (void)subpel_y;
  (void)y_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_x);
  convolve_horiz_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                         subpel_x, x_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
317
void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
318
                               uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
319
                               const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
320 321
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h) {
322
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
323 324
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

325 326 327
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
328 329
  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                     x_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
330 331
}

Fergus Simpson's avatar
Fergus Simpson committed
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347
// Horizontal-only scaled filter of a w x h block, averaged into dst.
// subpel_x is passed through as the initial position and x_step_qn as the
// per-pixel step; the kernel table is recovered from filter_x. The vertical
// parameters are unused.
void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int subpel_x,
                                     int x_step_qn, const int16_t *filter_y,
                                     int subpel_y, int y_step_qn, int w,
                                     int h) {
  // Present only to match the common convolve signature.
  (void)filter_y;
  (void)subpel_y;
  (void)y_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_x);
  convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                             subpel_x, x_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
348
void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
349
                          uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
350
                          const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
351 352
                          const int16_t *filter_y, int y_step_q4, int w,
                          int h) {
353
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
354
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
355 356 357 358

  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
359 360
  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
                w, h);
John Koleszar's avatar
John Koleszar committed
361 362
}

Fergus Simpson's avatar
Fergus Simpson committed
363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
// Vertical-only scaled filter of a w x h block. subpel_y is passed through as
// the initial position and y_step_qn as the per-row step; the kernel table is
// recovered from filter_y. The horizontal parameters are unused.
void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int subpel_x,
                                int x_step_qn, const int16_t *filter_y,
                                int subpel_y, int y_step_qn, int w, int h) {
  // Present only to match the common convolve signature.
  (void)filter_x;
  (void)subpel_x;
  (void)x_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_y);
  convolve_vert_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                        subpel_y, y_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
378
void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
379
                              uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
380
                              const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
381 382
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
383
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
384
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
385 386 387 388

  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
389 390
  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                    y_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
391 392
}

Fergus Simpson's avatar
Fergus Simpson committed
393 394 395 396 397 398 399 400 401 402 403 404 405 406 407
// Vertical-only scaled filter of a w x h block, averaged into dst. subpel_y
// is passed through as the initial position and y_step_qn as the per-row
// step; the kernel table is recovered from filter_y. The horizontal
// parameters are unused.
void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int subpel_x,
                                    int x_step_qn, const int16_t *filter_y,
                                    int subpel_y, int y_step_qn, int w, int h) {
  // Present only to match the common convolve signature.
  (void)filter_x;
  (void)subpel_x;
  (void)x_step_qn;

  const InterpKernel *const kernel_table = get_filter_base(filter_y);
  convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, kernel_table,
                            subpel_y, y_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
408
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
clang-format's avatar
clang-format committed
409 410
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
John Koleszar's avatar
John Koleszar committed
411
                     int w, int h) {
412
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
413 414
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

415
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
416 417
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

clang-format's avatar
clang-format committed
418
  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
Dmitry Kovalev's avatar
Dmitry Kovalev committed
419
           filters_y, y0_q4, y_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
420 421
}

Fergus Simpson's avatar
Fergus Simpson committed
422 423 424 425 426 427 428 429 430 431 432 433 434
// 2-D scaled filter of a w x h block. The kernel tables are recovered from
// filter_x / filter_y; subpel_x / subpel_y are passed through as the initial
// positions and x_step_qn / y_step_qn as the steps.
void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int subpel_x, int x_step_qn,
                           const int16_t *filter_y, int subpel_y, int y_step_qn,
                           int w, int h) {
  const InterpKernel *const horiz_table = get_filter_base(filter_x);
  const InterpKernel *const vert_table = get_filter_base(filter_y);

  convolve_scale_c(src, src_stride, dst, dst_stride, horiz_table, subpel_x,
                   x_step_qn, vert_table, subpel_y, y_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
435
void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
clang-format's avatar
clang-format committed
436 437
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
John Koleszar's avatar
John Koleszar committed
438
                         int w, int h) {
Christian Duvivier's avatar
Christian Duvivier committed
439
  /* Fixed size intermediate buffer places limits on parameters. */
440 441 442
  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
Christian Duvivier's avatar
Christian Duvivier committed
443

Yaowu Xu's avatar
Yaowu Xu committed
444
  aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
clang-format's avatar
clang-format committed
445
                  filter_y, y_step_q4, w, h);
Yaowu Xu's avatar
Yaowu Xu committed
446
  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
clang-format's avatar
clang-format committed
447
                     h);
John Koleszar's avatar
John Koleszar committed
448
}
449

Fergus Simpson's avatar
Fergus Simpson committed
450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465
void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int subpel_x,
                               int x_step_qn, const int16_t *filter_y,
                               int subpel_y, int y_step_qn, int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
                        x_step_qn, filter_y, subpel_y, y_step_qn, w, h);
  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
                     h);
}

Yaowu Xu's avatar
Yaowu Xu committed
466
// Copies a w x h block of bytes from src to dst, row by row.
//
// src_stride / dst_stride are the distances between consecutive rows. The
// filter parameters are unused; they exist so the function matches the common
// convolve signature. Rows are copied with memcpy, so a source row and its
// destination row must not overlap.
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (int r = h; r > 0; --r) {
    memcpy(dst, src, (size_t)w);
    src += src_stride;
    dst += dst_stride;
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
484
// Averages a w x h block of src into dst: every dst pixel becomes the rounded
// mean of its previous value and the corresponding src pixel.
//
// The filter parameters are unused; they exist so the function matches the
// common convolve signature.
void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);

    src += src_stride;
    dst += dst_stride;
  }
}
500

Yaowu Xu's avatar
Yaowu Xu committed
501
// Forwards all arguments unchanged to aom_convolve8_horiz_c.
void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
509
// Forwards all arguments unchanged to aom_convolve8_vert_c.
void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                       filter_y, y_step_q4, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
517
// Forwards all arguments unchanged to aom_convolve8_c.
void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                  filter_y, y_step_q4, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
525
// Forwards all arguments unchanged to aom_convolve8_avg_horiz_c.
void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
534
// Forwards all arguments unchanged to aom_convolve8_avg_vert_c.
void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                           x_step_q4, filter_y, y_step_q4, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
543
// Forwards all arguments unchanged to aom_convolve8_avg_c.
void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                      filter_y, y_step_q4, w, h);
}

551 552 553 554 555 556 557 558 559 560
#if CONFIG_HIGHBITDEPTH || CONFIG_LOOP_RESTORATION
// 16-bit-sample counterpart of vert_scalar_product: dot product of SUBPEL_TAPS
// samples taken from `a` at intervals of `a_stride` elements with the
// SUBPEL_TAPS coefficients at `b`.
static INLINE int highbd_vert_scalar_product(const uint16_t *a,
                                             ptrdiff_t a_stride,
                                             const int16_t *b) {
  int acc = 0;
  ptrdiff_t row_offset = 0;  // Always equals tap * a_stride.
  for (int tap = 0; tap < SUBPEL_TAPS; ++tap) {
    acc += a[row_offset] * b[tap];
    row_offset += a_stride;
  }
  return acc;
}
#endif

Fergus Simpson's avatar
Fergus Simpson committed
561
// TODO(afergs): Make sure this works too
562 563 564 565 566 567
#if CONFIG_LOOP_RESTORATION
static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_q4,
                                   int x_step_q4, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
568
  for (int y = 0; y < h; ++y) {
569
    int x_q4 = x0_q4;
570
    for (int x = 0; x < w; ++x) {
571 572
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
573 574

      const int sum = horz_scalar_product(src_x, x_filter);
575 576 577 578 579 580 581 582 583 584 585 586 587 588 589
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                          src_x[SUBPEL_TAPS / 2 - 1]);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_q4,
                                  int y_step_q4, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

590
  for (int x = 0; x < w; ++x) {
591
    int y_q4 = y0_q4;
592
    for (int y = 0; y < h; ++y) {
593 594
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
595
      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671
      dst[y * dst_stride] =
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                     src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

// 2-D "add source" filter done as two 1-D passes through a stack buffer:
// a horizontal pass into the buffer (row stride MAX_SB_SIZE), then a vertical
// pass from the buffer into dst.
static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters, int x0_q4,
                             int x_step_q4, const InterpKernel *const y_filters,
                             int y0_q4, int y_step_q4, int w, int h) {
  // Rows of context the vertical kernel needs above its center tap.
  const int rows_above = SUBPEL_TAPS / 2 - 1;
  // Rows the horizontal pass has to produce: the integer row reached by the
  // last output row, plus SUBPEL_TAPS rows for the vertical kernel.
  const int last_row_q4 = (h - 1) * y_step_q4 + y0_q4;
  const int rows_needed = (last_row_q4 >> SUBPEL_BITS) + SUBPEL_TAPS;
  uint8_t scratch[MAX_EXT_SIZE * MAX_SB_SIZE];

  // The fixed-size scratch buffer limits block size and step.
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  // Start rows_above rows early so the vertical pass has its leading taps.
  convolve_add_src_horiz(src - src_stride * rows_above, src_stride, scratch,
                         MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                         rows_needed);
  // Skip those leading rows again; the vertical pass backs up by itself.
  convolve_add_src_vert(scratch + MAX_SB_SIZE * rows_above, MAX_SB_SIZE, dst,
                        dst_stride, y_filters, y0_q4, y_step_q4, w, h);
}

// Horizontal-only "add source" convolution (C implementation).
// filter_x is split into a kernel table and a start offset by
// get_filter_base() / get_filter_offset(), which are defined elsewhere.
// filter_y and y_step_q4 are accepted but not used.
void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  const InterpKernel *const kernel_table = get_filter_base(filter_x);
  const int start_q4 = get_filter_offset(filter_x, kernel_table);

  // Silence unused-parameter warnings.
  (void)filter_y;
  (void)y_step_q4;

  convolve_add_src_horiz(src, src_stride, dst, dst_stride, kernel_table,
                         start_q4, x_step_q4, w, h);
}

// Vertical-only "add source" convolution (C implementation).
// filter_y is split into a kernel table and a start offset by
// get_filter_base() / get_filter_offset(), which are defined elsewhere.
// filter_x and x_step_q4 are accepted but not used.
void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  const InterpKernel *const kernel_table = get_filter_base(filter_y);
  const int start_q4 = get_filter_offset(filter_y, kernel_table);

  // Silence unused-parameter warnings.
  (void)filter_x;
  (void)x_step_q4;

  convolve_add_src_vert(src, src_stride, dst, dst_stride, kernel_table,
                        start_q4, y_step_q4, w, h);
}

// 2-D "add source" convolution (C implementation). Each filter pointer is
// split into a kernel table and a start offset by get_filter_base() /
// get_filter_offset() (defined elsewhere); convolve_add_src does the rest.
void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  // Horizontal kernel table and start offset.
  const InterpKernel *const x_table = get_filter_base(filter_x);
  const int x_start_q4 = get_filter_offset(filter_x, x_table);

  // Vertical kernel table and start offset.
  const InterpKernel *const y_table = get_filter_base(filter_y);
  const int y_start_q4 = get_filter_offset(filter_y, y_table);

  convolve_add_src(src, src_stride, dst, dst_stride, x_table, x_start_q4,
                   x_step_q4, y_table, y_start_q4, y_step_q4, w, h);
}
672 673 674 675 676

static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_q4,
                                       int x_step_q4, int w, int h) {
677
  const int bd = 8;
678
  src -= SUBPEL_TAPS / 2 - 1;
679
  for (int y = 0; y < h; ++y) {
680
    int x_q4 = x0_q4;
681
    for (int x = 0; x < w; ++x) {
682 683
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
684 685 686
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
                           (1 << (bd + FILTER_BITS - 1));
      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
687 688
      dst[x] =
          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
689
                          0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
690 691 692 693 694 695 696 697 698 699 700
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_q4,
                                      int y_step_q4, int w, int h) {
701
  const int bd = 8;
702 703
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

704
  for (int x = 0; x < w; ++x) {
705
    int y_q4 = y0_q4;
706
    for (int y = 0; y < h; ++y) {
707 708
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
709
      const int rounding =
710 711
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
712 713
      const int sum =
          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790
      dst[y * dst_stride] =
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

// 2-D "add source" filter with a 16-bit intermediate: the horizontal pass
// (convolve_add_src_horiz_hip) fills a 16-bit stack buffer with row stride
// MAX_SB_SIZE, and the vertical pass (convolve_add_src_vert_hip) reduces it
// to 8-bit pixels in dst.
static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *const x_filters, int x0_q4,
                                 int x_step_q4,
                                 const InterpKernel *const y_filters, int y0_q4,
                                 int y_step_q4, int w, int h) {
  // Rows of context the vertical kernel needs above its center tap.
  const int rows_above = SUBPEL_TAPS / 2 - 1;
  // Rows the horizontal pass has to produce: the integer row reached by the
  // last output row, plus SUBPEL_TAPS rows for the vertical kernel.
  const int last_row_q4 = (h - 1) * y_step_q4 + y0_q4;
  const int rows_needed = (last_row_q4 >> SUBPEL_BITS) + SUBPEL_TAPS;
  uint16_t scratch[MAX_EXT_SIZE * MAX_SB_SIZE];

  // The fixed-size scratch buffer limits block size and step.
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  // Start rows_above rows early so the vertical pass has its leading taps.
  convolve_add_src_horiz_hip(src - src_stride * rows_above, src_stride,
                             scratch, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4,
                             w, rows_needed);
  // Skip those leading rows again; the vertical pass backs up by itself.
  convolve_add_src_vert_hip(scratch + MAX_SB_SIZE * rows_above, MAX_SB_SIZE,
                            dst, dst_stride, y_filters, y0_q4, y_step_q4, w,
                            h);
}

// Horizontal-only "add source" convolution from 8-bit src to 16-bit dst
// (C implementation). filter_x is split into a kernel table and a start
// offset by get_filter_base() / get_filter_offset(), defined elsewhere.
// filter_y and y_step_q4 are accepted but not used.
void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h) {
  const InterpKernel *const kernel_table = get_filter_base(filter_x);
  const int start_q4 = get_filter_offset(filter_x, kernel_table);

  // Silence unused-parameter warnings.
  (void)filter_y;
  (void)y_step_q4;

  convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, kernel_table,
                             start_q4, x_step_q4, w, h);
}

// Vertical-only "add source" convolution from 16-bit src to 8-bit dst
// (C implementation). filter_y is split into a kernel table and a start
// offset by get_filter_base() / get_filter_offset(), defined elsewhere.
// filter_x and x_step_q4 are accepted but not used.
void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h) {
  const InterpKernel *const kernel_table = get_filter_base(filter_y);
  const int start_q4 = get_filter_offset(filter_y, kernel_table);

  // Silence unused-parameter warnings.
  (void)filter_x;
  (void)x_step_q4;

  convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, kernel_table,
                            start_q4, y_step_q4, w, h);
}

// 2-D "add source" convolution with a 16-bit intermediate (C
// implementation). Each filter pointer is split into a kernel table and a
// start offset by get_filter_base() / get_filter_offset() (defined
// elsewhere); convolve_add_src_hip does the rest.
void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  // Horizontal kernel table and start offset.
  const InterpKernel *const x_table = get_filter_base(filter_x);
  const int x_start_q4 = get_filter_offset(filter_x, x_table);

  // Vertical kernel table and start offset.
  const InterpKernel *const y_table = get_filter_base(filter_y);
  const int y_start_q4 = get_filter_offset(filter_y, y_table);

  convolve_add_src_hip(src, src_stride, dst, dst_stride, x_table, x_start_q4,
                       x_step_q4, y_table, y_start_q4, y_step_q4, w, h);
}
791 792
#endif  // CONFIG_LOOP_RESTORATION

Fergus Simpson's avatar
Fergus Simpson committed
793
// TODO(afergs): Make sure this works too
794
#if CONFIG_HIGHBITDEPTH
795 796 797 798 799 800 801 802

// Dot product of SUBPEL_TAPS consecutive 16-bit samples starting at `a` with
// the SUBPEL_TAPS coefficients in `b`.
static INLINE int highbd_horz_scalar_product(const uint16_t *a,
                                             const int16_t *b) {
  int acc = 0;
  int tap = 0;
  while (tap < SUBPEL_TAPS) {
    acc += a[tap] * b[tap];
    ++tap;
  }
  return acc;
}

803 804
static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
805 806
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h, int bd) {
807 808 809
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
810
  for (int y = 0; y < h; ++y) {
811
    int x_q4 = x0_q4;
812
    for (int x = 0; x < w; ++x) {
813 814
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
815
      const int sum = highbd_horz_scalar_product(src_x, x_filter);
816
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
817 818 819 820 821 822 823
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

824 825
static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                      uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
826 827
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h, int bd) {
828 829 830
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
831
  for (int y = 0; y < h; ++y) {
832
    int x_q4 = x0_q4;
833
    for (int x = 0; x < w; ++x) {
834 835
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
836
      const int sum = highbd_horz_scalar_product(src_x, x_filter);
clang-format's avatar
clang-format committed
837 838 839
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
840 841 842 843 844 845 846
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

847 848
static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
849 850
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h, int bd) {
851 852 853
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
854
  for (int x = 0; x < w; ++x) {
855
    int y_q4 = y0_q4;
856
    for (int y = 0; y < h; ++y) {
857 858
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
859
      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
clang-format's avatar
clang-format committed
860 861
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
862 863 864 865 866 867 868
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

869 870
static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                     uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
871 872
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h, int bd) {
873 874 875
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
876
  for (int x = 0; x < w; ++x) {
877
    int y_q4 = y0_q4;
878
    for (int y = 0; y < h; ++y) {
879 880
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
881
      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
clang-format's avatar
clang-format committed
882 883 884 885
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
886 887 888 889 890 891 892
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

893 894
static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
895 896 897
                            const InterpKernel *const x_filters, int x0_q4,
                            int x_step_q4, const InterpKernel *const y_filters,
                            int y0_q4, int y_step_q4, int w, int h, int bd) {
898 899 900 901 902 903 904 905 906 907 908 909
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
910
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
911
  int intermediate_height =
clang-format's avatar
clang-format committed
912
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
913

914 915
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
916 917 918
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

919
  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
clang-format's avatar
clang-format committed
920 921
                        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
                        x_step_q4, w, intermediate_height, bd);
922
  highbd_convolve_vert(
clang-format's avatar
clang-format committed
923 924
      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
925 926
}

Yaowu Xu's avatar
Yaowu Xu committed
927
void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
928 929
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
930 931
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h, int bd) {
932 933 934 935 936
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
937 938
  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                        x_step_q4, w, h, bd);
939 940
}

Yaowu Xu's avatar
Yaowu Xu committed
941
void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
942 943 944 945
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h, int bd) {
946 947 948 949 950
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
951 952
  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                            x_step_q4, w, h, bd);
953 954
}

Yaowu Xu's avatar
Yaowu Xu committed
955
void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
956 957
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
958 959
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h, int bd) {
960 961 962 963 964
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
965 966
  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                       y_step_q4, w, h, bd);
967 968
}

Yaowu Xu's avatar
Yaowu Xu committed
969
void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
970 971 972 973
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h, int bd) {
974 975 976 977 978
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
979 980
  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                           y_step_q4, w, h, bd);
981 982
}

Yaowu Xu's avatar
Yaowu Xu committed
983
void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
984 985
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
986 987
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h, int bd) {
988 989 990 991 992 993
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

clang-format's avatar
clang-format committed
994
  highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
995
                  filters_y, y0_q4, y_step_q4, w, h, bd);
996 997
}

Yaowu Xu's avatar
Yaowu Xu committed
998
void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
999 1000
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
1001 1002
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h, int bd) {
1003
  // Fixed size intermediate buffer places limits on parameters.
1004 1005 1006
  DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
1007

Yaowu Xu's avatar
Yaowu Xu committed
1008
  aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
1009
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
Yaowu Xu's avatar
Yaowu Xu committed
1010
  aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
clang-format's avatar
clang-format committed
1011
                            dst_stride, NULL, 0, NULL, 0, w, h, bd);
1012 1013
}

Yaowu Xu's avatar
Yaowu Xu committed
1014
// Copies a w x h block of 16-bit samples from src8 to dst8, one row per
// memcpy. Both pointers are converted with CONVERT_TO_SHORTPTR, and strides
// are applied to the converted 16-bit pointers. The filter arguments and bd
// are accepted but not used.
void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst_row = CONVERT_TO_SHORTPTR(dst8);

  // Silence unused-parameter warnings.
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (int row = 0; row < h; ++row) {
    memcpy(dst_row, src_row, w * sizeof(uint16_t));
    src_row += src_stride;
    dst_row += dst_stride;
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
1035
// Averages a w x h block of 16-bit samples from src8 into dst8:
// dst = (dst + src + 1) >> 1 for every sample. Both pointers are converted
// with CONVERT_TO_SHORTPTR, and strides are applied to the converted 16-bit
// pointers. The filter arguments and bd are accepted but not used.
void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst_row = CONVERT_TO_SHORTPTR(dst8);

  // Silence unused-parameter warnings.
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      dst_row[col] = ROUND_POWER_OF_TWO(dst_row[col] + src_row[col], 1);
    }
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066

#if CONFIG_LOOP_RESTORATION
static void highbd_convolve_add_src_horiz(const uint8_t *src8,
                                          ptrdiff_t src_stride, uint8_t *dst8,
                                          ptrdiff_t dst_stride,
                                          const InterpKernel *x_filters,
                                          int x0_q4, int x_step_q4, int w,
                                          int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
1067
  for (int y = 0; y < h; ++y) {
1068
    int x_q4 = x0_q4;
1069
    for (int x = 0; x < w; ++x) {
1070 1071
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1072
      const int sum = highbd_horz_scalar_product(src_x, x_filter);
1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091
      dst[x] = clip_pixel_highbd(
          ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
          bd);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void highbd_convolve_add_src_vert(const uint8_t *src8,
                                         ptrdiff_t src_stride, uint8_t *dst8,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *y_filters,
                                         int y0_q4, int y_step_q4, int w, int h,
                                         int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1092
  for (int x = 0; x < w; ++x) {
1093
    int y_q4 = y0_q4;
1094
    for (int y = 0; y < h; ++y) {
1095 1096
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1097
      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                                src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
                            bd);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

// 2-D high-bit-depth "add source" filter done as two 1-D passes through a
// fixed-size stack buffer:
//   (1) filter horizontally into the buffer (row stride MAX_SB_SIZE);
//   (2) filter the buffer vertically into dst.
// Pass (1) must produce every row pass (2) reads: the integer row reached by
// the last output row, plus SUBPEL_TAPS rows for the vertical kernel. The
// buffer has room for MAX_EXT_SIZE such rows, which is why the asserts below
// cap the block size and both step sizes.
static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    int x0_q4, int x_step_q4,
                                    const InterpKernel *const y_filters,
                                    int y0_q4, int y_step_q4, int w, int h,
                                    int bd) {
  // Rows of context the vertical kernel needs above its center tap.
  const int rows_above = SUBPEL_TAPS / 2 - 1;
  const int last_row_q4 = (h - 1) * y_step_q4 + y0_q4;
  const int rows_needed = (last_row_q4 >> SUBPEL_BITS) + SUBPEL_TAPS;
  uint16_t scratch[MAX_EXT_SIZE * MAX_SB_SIZE];

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  // Start rows_above rows early so the vertical pass has its leading taps.
  highbd_convolve_add_src_horiz(src - src_stride * rows_above, src_stride,
                                CONVERT_TO_BYTEPTR(scratch), MAX_SB_SIZE,
                                x_filters, x0_q4, x_step_q4, w, rows_needed,
                                bd);
  // Skip those leading rows again; the vertical pass backs up by itself.
  highbd_convolve_add_src_vert(
      CONVERT_TO_BYTEPTR(scratch) + MAX_SB_SIZE * rows_above, MAX_SB_SIZE, dst,
      dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
}

// 2-D high-bit-depth "add source" convolution (C implementation). Each filter
// pointer is split into a kernel table and a start offset by
// get_filter_base() / get_filter_offset() (defined elsewhere);
// highbd_convolve_add_src does the rest.
void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h, int bd) {
  // Horizontal kernel table and start offset.
  const InterpKernel *const x_table = get_filter_base(filter_x);
  const int x_start_q4 = get_filter_offset(filter_x, x_table);

  // Vertical kernel table and start offset.
  const InterpKernel *const y_table = get_filter_base(filter_y);
  const int y_start_q4 = get_filter_offset(filter_y, y_table);

  highbd_convolve_add_src(src, src_stride, dst, dst_stride, x_table,
                          x_start_q4, x_step_q4, y_table, y_start_q4,
                          y_step_q4, w, h, bd);
}
1160 1161 1162 1163 1164

static void highbd_convolve_add_src_horiz_hip(
    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
    int x_step_q4, int w, int h, int bd) {
1165
  const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
1166 1167
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  src -= SUBPEL_TAPS / 2 - 1;
1168
  for (int y = 0; y < h; ++y) {
1169
    int x_q4 = x0_q4;
1170
    for (int x = 0; x < w; ++x) {
1171 1172
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1173 1174 1175
      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
                           (1 << (bd + FILTER_BITS - 1));
      const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191
      dst[x] =
          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
                          0, extraprec_clamp_limit - 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void highbd_convolve_a