aom_convolve.c 54.6 KB
Newer Older
John Koleszar's avatar
John Koleszar committed
1
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
Christian Duvivier's avatar
Christian Duvivier committed
11

John Koleszar's avatar
John Koleszar committed
12
#include <assert.h>
Zoe Liu's avatar
Zoe Liu committed
13
#include <string.h>
John Koleszar's avatar
John Koleszar committed
14

Yaowu Xu's avatar
Yaowu Xu committed
15
16
17
18
19
20
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
21
#include "aom_ports/mem.h"
John Koleszar's avatar
John Koleszar committed
22

Dmitry Kovalev's avatar
Dmitry Kovalev committed
23
24
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
25
26
                           const InterpKernel *x_filters, int x0_q4,
                           int x_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
27
  src -= SUBPEL_TAPS / 2 - 1;
28
  for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
29
    int x_q4 = x0_q4;
30
    for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
31
32
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
33
34
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
35
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
John Koleszar's avatar
John Koleszar committed
36
37
38
39
40
41
42
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
43
44
45
46
47
static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_qn,
                                   int x_step_qn, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
48
  for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
49
    int x_qn = x0_qn;
50
    for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
51
52
53
54
      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];  // q8
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter = x_filters[x_filter_idx];
55
56
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
Fergus Simpson's avatar
Fergus Simpson committed
57
58
59
60
61
62
63
64
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_qn += x_step_qn;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
65
66
static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
67
68
                               const InterpKernel *x_filters, int x0_q4,
                               int x_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
69
  src -= SUBPEL_TAPS / 2 - 1;
70
  for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
71
    int x_q4 = x0_q4;
72
    for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
73
74
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
75
76
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
clang-format's avatar
clang-format committed
77
78
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
John Koleszar's avatar
John Koleszar committed
79
80
81
82
83
84
85
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
86
87
88
89
90
static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_qn,
                                       int x_step_qn, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
91
  for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
92
    int x_qn = x0_qn;
93
    for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
94
95
96
97
      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter = x_filters[x_filter_idx];
98
99
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
Fergus Simpson's avatar
Fergus Simpson committed
100
101
102
103
104
105
106
107
108
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
      x_qn += x_step_qn;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
109
110
static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
111
112
                          const InterpKernel *y_filters, int y0_q4,
                          int y_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
113
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
John Koleszar's avatar
John Koleszar committed
114

115
  for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
116
    int y_q4 = y0_q4;
117
    for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
118
119
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
120
121
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k)
Dmitry Kovalev's avatar
Dmitry Kovalev committed
122
123
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
John Koleszar's avatar
John Koleszar committed
124
125
126
127
128
129
130
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
131
132
133
134
135
136
static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_qn,
                                  int y_step_qn, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

137
  for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
138
    int y_qn = y0_qn;
139
    for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
140
141
142
143
      const unsigned char *src_y =
          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter =
          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
144
145
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k)
Fergus Simpson's avatar
Fergus Simpson committed
146
147
148
149
150
151
152
153
154
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      y_qn += y_step_qn;
    }
    ++src;
    ++dst;
  }
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
155
156
static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
157
158
                              const InterpKernel *y_filters, int y0_q4,
                              int y_step_q4, int w, int h) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
159
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
John Koleszar's avatar
John Koleszar committed
160

161
  for (int x = 0; x < w; ++x) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
162
    int y_q4 = y0_q4;
163
    for (int y = 0; y < h; ++y) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
164
165
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
166
167
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k)
Dmitry Kovalev's avatar
Dmitry Kovalev committed
168
        sum += src_y[k * src_stride] * y_filter[k];
clang-format's avatar
clang-format committed
169
170
171
172
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
John Koleszar's avatar
John Koleszar committed
173
174
175
176
177
178
179
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

Fergus Simpson's avatar
Fergus Simpson committed
180
181
182
183
184
185
static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_qn,
                                      int y_step_qn, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

186
  for (int x = 0; x < w; ++x) {
Fergus Simpson's avatar
Fergus Simpson committed
187
    int y_qn = y0_qn;
188
    for (int y = 0; y < h; ++y) {
Fergus Simpson's avatar
Fergus Simpson committed
189
190
191
192
      const unsigned char *src_y =
          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter =
          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
193
194
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k)
Fergus Simpson's avatar
Fergus Simpson committed
195
196
197
198
199
200
201
202
203
204
205
206
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
      y_qn += y_step_qn;
    }
    ++src;
    ++dst;
  }
}

clang-format's avatar
clang-format committed
207
208
static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,
Dmitry Kovalev's avatar
Dmitry Kovalev committed
209
                     int x0_q4, int x_step_q4,
clang-format's avatar
clang-format committed
210
211
                     const InterpKernel *const y_filters, int y0_q4,
                     int y_step_q4, int w, int h) {
212
213
214
215
216
217
218
219
220
221
222
223
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
224
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
225
  int intermediate_height =
clang-format's avatar
clang-format committed
226
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
227

228
229
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
230

231
232
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);
233

clang-format's avatar
clang-format committed
234
235
236
237
238
  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
                 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                 intermediate_height);
  convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
                dst_stride, y_filters, y0_q4, y_step_q4, w, h);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
239
240
}

Fergus Simpson's avatar
Fergus Simpson committed
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
// Separable 2-D convolution in the higher-precision "qn" position format: a
// horizontal pass into a stack buffer followed by a vertical pass from that
// buffer into dst.
//
// x0_qn/x_step_qn and y0_qn/y_step_qn give the starting sub-pixel phase and
// per-pixel step for each direction, with SCALE_SUBPEL_BITS fractional bits.
// w and h must not exceed MAX_SB_SIZE and each step must not exceed two source
// pixels (checked with assert only).
static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters, int x0_qn,
                             int x_step_qn, const InterpKernel *const y_filters,
                             int y0_qn, int y_step_qn, int w, int h) {
  // TODO(afergs): Update comment here
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_qn = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  // Rows the vertical pass will read: the last output row's source row plus
  // the filter taps.
  int intermediate_height =
      (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  // One source pixel is (1 << SCALE_SUBPEL_BITS) in qn units, so the x1/2
  // scaling limit corresponds to a step of two pixels. The bound was
  // previously written as SCALE_SUBPEL_BITS * 2, which compared the step
  // against a bit count rather than a position.
  assert(y_step_qn <= (2 << SCALE_SUBPEL_BITS));
  assert(x_step_qn <= (2 << SCALE_SUBPEL_BITS));

  // The horizontal pass starts (SUBPEL_TAPS / 2 - 1) rows above the block so
  // that temp contains the rows needed by the upper taps of the vertical pass.
  convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
                         temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
                         intermediate_height);
  // convolve_vert_scale_c() steps back by the same number of rows, so hand it
  // the row of temp that corresponds to the first row of the block.
  convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
                        dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
}

276
// Returns the start of the kernel table that contains `filter`, obtained by
// clearing the low 8 bits of the pointer value.
static const InterpKernel *get_filter_base(const int16_t *filter) {
  // NOTE: This assumes that the filter table is 256-byte aligned.
  // TODO(agrange) Modify to make independent of table alignment.
  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
}

282
// Returns the index of kernel `f` within the table starting at `base`, i.e.
// the pointer difference measured in InterpKernel elements.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  // The pointer is converted via intptr_t before being viewed as an
  // InterpKernel pointer.
  return (int)((const InterpKernel *)(intptr_t)f - base);
}

Yaowu Xu's avatar
Yaowu Xu committed
286
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
287
                           uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
288
                           const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
289
290
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
291
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
292
293
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

294
295
296
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
297
298
  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
                 w, h);
John Koleszar's avatar
John Koleszar committed
299
300
}

Fergus Simpson's avatar
Fergus Simpson committed
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
// Horizontal-only convolution entry point for the "qn" position format.
//
// The kernel table is recovered from filter_x with get_filter_base(), and
// subpel_x is passed through as the starting position x0_qn. subpel_y,
// filter_y and y_step_qn are unused.
void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int subpel_x,
                                 int x_step_qn, const int16_t *filter_y,
                                 int subpel_y, int y_step_qn, int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);

  (void)subpel_y;
  (void)filter_y;
  (void)y_step_qn;

  convolve_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
                         x_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
316
void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
317
                               uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
318
                               const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
319
320
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h) {
321
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
322
323
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

324
325
326
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
327
328
  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                     x_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
329
330
}

Fergus Simpson's avatar
Fergus Simpson committed
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
// "qn"-precision horizontal-only convolution entry point that blends the
// filtered result with the existing contents of dst.
//
// The kernel table is recovered from filter_x, and subpel_x is passed through
// as the starting position. subpel_y, filter_y and y_step_qn are unused.
void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int subpel_x,
                                     int x_step_qn, const int16_t *filter_y,
                                     int subpel_y, int y_step_qn, int w,
                                     int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);

  (void)subpel_y;
  (void)filter_y;
  (void)y_step_qn;

  convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x,
                             subpel_x, x_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
347
void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
348
                          uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
349
                          const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
350
351
                          const int16_t *filter_y, int y_step_q4, int w,
                          int h) {
352
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
353
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
354
355
356
357

  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
358
359
  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
                w, h);
John Koleszar's avatar
John Koleszar committed
360
361
}

Fergus Simpson's avatar
Fergus Simpson committed
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
// Vertical-only convolution entry point for the "qn" position format.
//
// The kernel table is recovered from filter_y with get_filter_base(), and
// subpel_y is passed through as the starting position y0_qn. subpel_x,
// filter_x and x_step_qn are unused.
void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int subpel_x,
                                int x_step_qn, const int16_t *filter_y,
                                int subpel_y, int y_step_qn, int w, int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);

  (void)subpel_x;
  (void)filter_x;
  (void)x_step_qn;

  convolve_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, subpel_y,
                        y_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
377
void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
378
                              uint8_t *dst, ptrdiff_t dst_stride,
John Koleszar's avatar
John Koleszar committed
379
                              const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
380
381
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h) {
382
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
383
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
384
385
386
387

  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
388
389
  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                    y_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
390
391
}

Fergus Simpson's avatar
Fergus Simpson committed
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
// "qn"-precision vertical-only convolution entry point that blends the
// filtered result with the existing contents of dst.
//
// The kernel table is recovered from filter_y, and subpel_y is passed through
// as the starting position. subpel_x, filter_x and x_step_qn are unused.
void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int subpel_x,
                                    int x_step_qn, const int16_t *filter_y,
                                    int subpel_y, int y_step_qn, int w, int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);

  (void)subpel_x;
  (void)filter_x;
  (void)x_step_qn;

  convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, filters_y,
                            subpel_y, y_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
407
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
clang-format's avatar
clang-format committed
408
409
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
John Koleszar's avatar
John Koleszar committed
410
                     int w, int h) {
411
  const InterpKernel *const filters_x = get_filter_base(filter_x);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
412
413
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

414
  const InterpKernel *const filters_y = get_filter_base(filter_y);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
415
416
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

clang-format's avatar
clang-format committed
417
  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
Dmitry Kovalev's avatar
Dmitry Kovalev committed
418
           filters_y, y0_q4, y_step_q4, w, h);
John Koleszar's avatar
John Koleszar committed
419
420
}

Fergus Simpson's avatar
Fergus Simpson committed
421
422
423
424
425
426
427
428
429
430
431
432
433
// 2-D convolution entry point for the "qn" position format.
//
// The kernel tables are recovered from filter_x / filter_y with
// get_filter_base(). subpel_x and subpel_y are passed through as the starting
// positions. The work is done by convolve_scale_c().
void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int subpel_x, int x_step_qn,
                           const int16_t *filter_y, int subpel_y, int y_step_qn,
                           int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const InterpKernel *const filters_y = get_filter_base(filter_y);

  convolve_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
                   x_step_qn, filters_y, subpel_y, y_step_qn, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
434
void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
clang-format's avatar
clang-format committed
435
436
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
John Koleszar's avatar
John Koleszar committed
437
                         int w, int h) {
Christian Duvivier's avatar
Christian Duvivier committed
438
  /* Fixed size intermediate buffer places limits on parameters. */
439
440
441
  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
Christian Duvivier's avatar
Christian Duvivier committed
442

Yaowu Xu's avatar
Yaowu Xu committed
443
  aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
clang-format's avatar
clang-format committed
444
                  filter_y, y_step_q4, w, h);
Yaowu Xu's avatar
Yaowu Xu committed
445
  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
clang-format's avatar
clang-format committed
446
                     h);
John Koleszar's avatar
John Koleszar committed
447
}
448

Fergus Simpson's avatar
Fergus Simpson committed
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
// "qn"-precision 2-D convolution whose result is blended with the existing
// contents of dst.
//
// The filtered block is first written to a stack buffer with
// aom_convolve8_scale_c() and then combined with dst by aom_convolve_avg_c().
// w and h must not exceed MAX_SB_SIZE (checked with assert only).
void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int subpel_x,
                               int x_step_qn, const int16_t *filter_y,
                               int subpel_y, int y_step_qn, int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
                        x_step_qn, filter_y, subpel_y, y_step_qn, w, h);
  // The filter arguments are ignored by aom_convolve_avg_c().
  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
                     h);
}

Yaowu Xu's avatar
Yaowu Xu committed
465
// Copies a w x h block of bytes from src to dst, one row at a time.
//
// The filter arguments exist only so that the signature matches the other
// convolve functions; they are ignored. src and dst rows must not overlap
// because each row is transferred with memcpy().
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (int r = 0; r < h; ++r) {
    // Make the int -> size_t conversion of the row width explicit.
    memcpy(dst, src, (size_t)w);
    src += src_stride;
    dst += dst_stride;
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
483
// Blends a w x h block of src into dst: every destination pixel is replaced
// by ROUND_POWER_OF_TWO(dst + src, 1).
//
// The filter arguments exist only so that the signature matches the other
// convolve functions; they are ignored.
void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);

    src += src_stride;
    dst += dst_stride;
  }
}
499

Yaowu Xu's avatar
Yaowu Xu committed
500
// Forwards all arguments unchanged to aom_convolve8_horiz_c().
void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
508
// Forwards all arguments unchanged to aom_convolve8_vert_c().
void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                       filter_y, y_step_q4, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
516
// Forwards all arguments unchanged to aom_convolve8_c().
void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                  filter_y, y_step_q4, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
524
// Forwards all arguments unchanged to aom_convolve8_avg_horiz_c().
void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
533
// Forwards all arguments unchanged to aom_convolve8_avg_vert_c().
void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                           x_step_q4, filter_y, y_step_q4, w, h);
}

Yaowu Xu's avatar
Yaowu Xu committed
542
// Forwards all arguments unchanged to aom_convolve8_avg_c().
void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                      filter_y, y_step_q4, w, h);
}

Fergus Simpson's avatar
Fergus Simpson committed
550
// TODO(afergs): Make sure this works too
551
552
553
554
555
556
#if CONFIG_LOOP_RESTORATION
static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *x_filters, int x0_q4,
                                   int x_step_q4, int w, int h) {
  src -= SUBPEL_TAPS / 2 - 1;
557
  for (int y = 0; y < h; ++y) {
558
    int x_q4 = x0_q4;
559
    for (int x = 0; x < w; ++x) {
560
561
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
562
      int sum = 0;
563
      for (int k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                          src_x[SUBPEL_TAPS / 2 - 1]);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *y_filters, int y0_q4,
                                  int y_step_q4, int w, int h) {
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

579
  for (int x = 0; x < w; ++x) {
580
    int y_q4 = y0_q4;
581
    for (int y = 0; y < h; ++y) {
582
583
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
584
      int sum = 0;
585
      for (int k = 0; k < SUBPEL_TAPS; ++k)
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] =
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                     src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

// Two-pass (horizontal then vertical) add-source convolution.
//
// The horizontal pass writes into a fixed-size stack buffer with row stride
// MAX_SB_SIZE. It starts SUBPEL_TAPS / 2 - 1 rows above src and produces
// enough rows to feed every vertical tap. The vertical pass then reads from
// that buffer and writes the w x h result to dst. The fixed buffer size is
// why w, h and the step sizes are bounded by the asserts.
static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *const x_filters, int x0_q4,
                             int x_step_q4, const InterpKernel *const y_filters,
                             int y0_q4, int y_step_q4, int w, int h) {
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  // Rows above the block that the vertical filter reaches back into.
  const int lead_rows = SUBPEL_TAPS / 2 - 1;
  // Rows the horizontal pass must produce for the vertical pass.
  const int rows_needed =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];

  convolve_add_src_horiz(src - src_stride * lead_rows, src_stride, temp,
                         MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                         rows_needed);
  convolve_add_src_vert(temp + MAX_SB_SIZE * lead_rows, MAX_SB_SIZE, dst,
                        dst_stride, y_filters, y0_q4, y_step_q4, w, h);
}

// Public horizontal-only add-source convolution.
// filter_x is resolved into a kernel table and a starting offset with
// get_filter_base() / get_filter_offset(); filter_y and y_step_q4 are unused.
void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  (void)filter_y;
  (void)y_step_q4;

  const InterpKernel *const kernels = get_filter_base(filter_x);
  const int start_q4 = get_filter_offset(filter_x, kernels);
  convolve_add_src_horiz(src, src_stride, dst, dst_stride, kernels, start_q4,
                         x_step_q4, w, h);
}

// Public vertical-only add-source convolution.
// filter_y is resolved into a kernel table and a starting offset with
// get_filter_base() / get_filter_offset(); filter_x and x_step_q4 are unused.
void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  (void)filter_x;
  (void)x_step_q4;

  const InterpKernel *const kernels = get_filter_base(filter_y);
  const int start_q4 = get_filter_offset(filter_y, kernels);
  convolve_add_src_vert(src, src_stride, dst, dst_stride, kernels, start_q4,
                        y_step_q4, w, h);
}

// Public 2-D add-source convolution.
// Both filter pointers are resolved into (kernel table, starting offset)
// pairs, which are then handed to the two-pass convolve_add_src().
void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  const InterpKernel *const kernels_h = get_filter_base(filter_x);
  const InterpKernel *const kernels_v = get_filter_base(filter_y);
  const int start_x_q4 = get_filter_offset(filter_x, kernels_h);
  const int start_y_q4 = get_filter_offset(filter_y, kernels_v);

  convolve_add_src(src, src_stride, dst, dst_stride, kernels_h, start_x_q4,
                   x_step_q4, kernels_v, start_y_q4, y_step_q4, w, h);
}
663
664
665
666
667

static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const InterpKernel *x_filters, int x0_q4,
                                       int x_step_q4, int w, int h) {
668
  const int bd = 8;
669
  src -= SUBPEL_TAPS / 2 - 1;
670
  for (int y = 0; y < h; ++y) {
671
    int x_q4 = x0_q4;
672
    for (int x = 0; x < w; ++x) {
673
674
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
675
676
      int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
                (1 << (bd + FILTER_BITS - 1));
677
      for (int k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
678
679
      dst[x] =
          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
680
                          0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
681
682
683
684
685
686
687
688
689
690
691
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_q4,
                                      int y_step_q4, int w, int h) {
692
  const int bd = 8;
693
694
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

695
  for (int x = 0; x < w; ++x) {
696
    int y_q4 = y0_q4;
697
    for (int y = 0; y < h; ++y) {
698
699
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
700
701
702
      int sum =
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
703
      for (int k = 0; k < SUBPEL_TAPS; ++k)
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] =
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

// Two-pass add-source convolution with a 16-bit intermediate buffer.
//
// Same structure as the 8-bit two-pass routine: the horizontal pass starts
// SUBPEL_TAPS / 2 - 1 rows above src and fills a fixed-size stack buffer
// (row stride MAX_SB_SIZE) with higher-precision samples. The vertical pass
// then reduces that buffer to the 8-bit w x h result in dst.
static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *const x_filters, int x0_q4,
                                 int x_step_q4,
                                 const InterpKernel *const y_filters, int y0_q4,
                                 int y_step_q4, int w, int h) {
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  // Rows above the block that the vertical filter reaches back into.
  const int lead_rows = SUBPEL_TAPS / 2 - 1;
  // Rows the horizontal pass must produce for the vertical pass.
  const int rows_needed =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];

  convolve_add_src_horiz_hip(src - src_stride * lead_rows, src_stride, temp,
                             MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                             rows_needed);
  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * lead_rows, MAX_SB_SIZE, dst,
                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
}

// Public horizontal-only add-source convolution with 16-bit output.
// filter_x is resolved into a kernel table and a starting offset with
// get_filter_base() / get_filter_offset(); filter_y and y_step_q4 are unused.
void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h) {
  (void)filter_y;
  (void)y_step_q4;

  const InterpKernel *const kernels = get_filter_base(filter_x);
  const int start_q4 = get_filter_offset(filter_x, kernels);
  convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, kernels,
                             start_q4, x_step_q4, w, h);
}

// Public vertical-only add-source convolution over 16-bit input.
// filter_y is resolved into a kernel table and a starting offset with
// get_filter_base() / get_filter_offset(); filter_x and x_step_q4 are unused.
void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h) {
  (void)filter_x;
  (void)x_step_q4;

  const InterpKernel *const kernels = get_filter_base(filter_y);
  const int start_q4 = get_filter_offset(filter_y, kernels);
  convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, kernels,
                            start_q4, y_step_q4, w, h);
}

// Public 2-D add-source convolution using the 16-bit intermediate path.
// Both filter pointers are resolved into (kernel table, starting offset)
// pairs, which are then handed to convolve_add_src_hip().
void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  const InterpKernel *const kernels_h = get_filter_base(filter_x);
  const InterpKernel *const kernels_v = get_filter_base(filter_y);
  const int start_x_q4 = get_filter_offset(filter_x, kernels_h);
  const int start_y_q4 = get_filter_offset(filter_y, kernels_v);

  convolve_add_src_hip(src, src_stride, dst, dst_stride, kernels_h, start_x_q4,
                       x_step_q4, kernels_v, start_y_q4, y_step_q4, w, h);
}
782
783
#endif  // CONFIG_LOOP_RESTORATION

Fergus Simpson's avatar
Fergus Simpson committed
784
// TODO(afergs): Make sure this works too
785
#if CONFIG_HIGHBITDEPTH
786
787
static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
788
789
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h, int bd) {
790
791
792
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
793
  for (int y = 0; y < h; ++y) {
794
    int x_q4 = x0_q4;
795
    for (int x = 0; x < w; ++x) {
796
797
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
798
799
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
800
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
801
802
803
804
805
806
807
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

808
809
static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                      uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
810
811
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h, int bd) {
812
813
814
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
815
  for (int y = 0; y < h; ++y) {
816
    int x_q4 = x0_q4;
817
    for (int x = 0; x < w; ++x) {
818
819
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
820
821
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
clang-format's avatar
clang-format committed
822
823
824
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
825
826
827
828
829
830
831
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

832
833
static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
834
835
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h, int bd) {
836
837
838
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
839
  for (int x = 0; x < w; ++x) {
840
    int y_q4 = y0_q4;
841
    for (int y = 0; y < h; ++y) {
842
843
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
844
845
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k)
846
        sum += src_y[k * src_stride] * y_filter[k];
clang-format's avatar
clang-format committed
847
848
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
849
850
851
852
853
854
855
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

856
857
static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                     uint8_t *dst8, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
858
859
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h, int bd) {
860
861
862
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
863
  for (int x = 0; x < w; ++x) {
864
    int y_q4 = y0_q4;
865
    for (int y = 0; y < h; ++y) {
866
867
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
868
869
      int sum = 0;
      for (int k = 0; k < SUBPEL_TAPS; ++k)
870
        sum += src_y[k * src_stride] * y_filter[k];
clang-format's avatar
clang-format committed
871
872
873
874
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
875
876
877
878
879
880
881
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

882
883
static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
clang-format's avatar
clang-format committed
884
885
886
                            const InterpKernel *const x_filters, int x0_q4,
                            int x_step_q4, const InterpKernel *const y_filters,
                            int y0_q4, int y_step_q4, int w, int h, int bd) {
887
888
889
890
891
892
893
894
895
896
897
898
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
899
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
900
  int intermediate_height =
clang-format's avatar
clang-format committed
901
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
902

903
904
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
905
906
907
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

908
  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
clang-format's avatar
clang-format committed
909
910
                        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
                        x_step_q4, w, intermediate_height, bd);
911
  highbd_convolve_vert(
clang-format's avatar
clang-format committed
912
913
      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
914
915
}

Yaowu Xu's avatar
Yaowu Xu committed
916
void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
917
918
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
919
920
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h, int bd) {
921
922
923
924
925
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
926
927
  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                        x_step_q4, w, h, bd);
928
929
}

Yaowu Xu's avatar
Yaowu Xu committed
930
void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
931
932
933
934
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h, int bd) {
935
936
937
938
939
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;

clang-format's avatar
clang-format committed
940
941
  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
                            x_step_q4, w, h, bd);
942
943
}

Yaowu Xu's avatar
Yaowu Xu committed
944
void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
945
946
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
947
948
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h, int bd) {
949
950
951
952
953
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
954
955
  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                       y_step_q4, w, h, bd);
956
957
}

Yaowu Xu's avatar
Yaowu Xu committed
958
void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
959
960
961
962
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h, int bd) {
963
964
965
966
967
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;

clang-format's avatar
clang-format committed
968
969
  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
                           y_step_q4, w, h, bd);
970
971
}

Yaowu Xu's avatar
Yaowu Xu committed
972
void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
973
974
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
975
976
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h, int bd) {
977
978
979
980
981
982
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);

  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);

clang-format's avatar
clang-format committed
983
  highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
984
                  filters_y, y0_q4, y_step_q4, w, h, bd);
985
986
}

Yaowu Xu's avatar
Yaowu Xu committed
987
void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
988
989
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
clang-format's avatar
clang-format committed
990
991
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h, int bd) {
992
  // Fixed size intermediate buffer places limits on parameters.
993
994
995
  DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
996

Yaowu Xu's avatar
Yaowu Xu committed
997
  aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
998
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
Yaowu Xu's avatar
Yaowu Xu committed
999
  aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
clang-format's avatar
clang-format committed
1000
                            dst_stride, NULL, 0, NULL, 0, w, h, bd);
1001
1002
}

Yaowu Xu's avatar
Yaowu Xu committed
1003
// High-bitdepth block copy.
//
// Copies h rows of w uint16_t samples from src8 to dst8 (both converted with
// CONVERT_TO_SHORTPTR), advancing by the respective strides after each row.
// The filter arguments and bd exist only to match the shared convolve
// signature and are ignored.
void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst_row = CONVERT_TO_SHORTPTR(dst8);
  const size_t row_bytes = w * sizeof(uint16_t);

  for (int row = 0; row < h; ++row) {
    memcpy(dst_row, src_row, row_bytes);
    src_row += src_stride;
    dst_row += dst_stride;
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
1024
// High-bitdepth block average.
//
// For every sample of the w x h block, replaces dst with
// ROUND_POWER_OF_TWO(dst + src, 1), where both buffers are accessed as
// uint16_t via CONVERT_TO_SHORTPTR. The filter arguments and bd exist only to
// match the shared convolve signature and are ignored.
void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst_row = CONVERT_TO_SHORTPTR(dst8);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      const int total = dst_row[col] + src_row[col];
      dst_row[col] = ROUND_POWER_OF_TWO(total, 1);
    }
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055

#if CONFIG_LOOP_RESTORATION
static void highbd_convolve_add_src_horiz(const uint8_t *src8,
                                          ptrdiff_t src_stride, uint8_t *dst8,
                                          ptrdiff_t dst_stride,
                                          const InterpKernel *x_filters,
                                          int x0_q4, int x_step_q4, int w,
                                          int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
1056
  for (int y = 0; y < h; ++y) {
1057
    int x_q4 = x0_q4;
1058
    for (int x = 0; x < w; ++x) {
1059
1060
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1061
      int sum = 0;
1062
      for (int k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];