/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 *
 */

#include <math.h>

Yaowu Xu's avatar
Yaowu Xu committed
15
16
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
17
#include "./aom_scale_rtcd.h"
18
19
#include "av1/common/onyxc_int.h"
#include "av1/common/restoration.h"
Yaowu Xu's avatar
Yaowu Xu committed
20
21
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"
22
#include "aom_ports/mem.h"
23

24
// Table of SGRPROJ_PARAMS parameter sets for the self-guided projection
// filter. The meaning of the four columns depends on the build
// configuration and is given by the comment at the top of each branch.
const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
#if USE_HIGHPASS_IN_SGRPROJ
  // corner, edge, r2, eps2
  { -1, 2, 1, 1 }, { -1, 2, 1, 2 }, { -1, 2, 1, 3 }, { -1, 2, 1, 4 },
  { -1, 2, 1, 5 }, { -2, 3, 1, 2 }, { -2, 3, 1, 3 }, { -2, 3, 1, 4 },
  { -2, 3, 1, 5 }, { -2, 3, 1, 6 }, { -3, 4, 1, 3 }, { -3, 4, 1, 4 },
  { -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 }
#else
  // r1, eps1, r2, eps2
  { 2, 12, 1, 4 },  { 2, 15, 1, 6 },  { 2, 18, 1, 8 },  { 2, 20, 1, 9 },
  { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
  { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 },
  { 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 },
#endif
};

clang-format's avatar
clang-format committed
40
41
// Signature shared by the 8-bit whole-frame restoration routines: filter
// `data8` (width x height, row pitch `stride`) using the state in `rst` and
// write the result to `dst8` (row pitch `dst_stride`).
typedef void (*restore_func_type)(uint8_t *data8, int width, int height,
                                  int stride, RestorationInternal *rst,
                                  uint8_t *dst8, int dst_stride);
#if CONFIG_HIGHBITDEPTH
// High bit depth counterpart of restore_func_type; additionally receives the
// sample bit depth.
typedef void (*restore_func_highbd_type)(uint8_t *data8, int width, int height,
                                         int stride, RestorationInternal *rst,
                                         int bit_depth, uint8_t *dst8,
                                         int dst_stride);
#endif  // CONFIG_HIGHBITDEPTH
49

50
51
// (Re)allocates the per-tile arrays of `rst_info` for a plane of the given
// dimensions and returns the number of restoration tiles.
//
// Any arrays previously held by `rst_info` are freed first. The Wiener
// array is 16-byte aligned and zero-filled; the restoration_type and
// sgrproj_info arrays are left uninitialized. Allocation failures are
// reported through CHECK_MEM_ERROR on `cm`.
int av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rst_info,
                                 int width, int height) {
  const int ntiles = av1_get_rest_ntiles(
      width, height, rst_info->restoration_tilesize, NULL, NULL, NULL, NULL);

  aom_free(rst_info->restoration_type);
  CHECK_MEM_ERROR(cm, rst_info->restoration_type,
                  (RestorationType *)aom_malloc(
                      sizeof(*rst_info->restoration_type) * ntiles));

  aom_free(rst_info->wiener_info);
  CHECK_MEM_ERROR(
      cm, rst_info->wiener_info,
      (WienerInfo *)aom_memalign(16, sizeof(*rst_info->wiener_info) * ntiles));
  memset(rst_info->wiener_info, 0, sizeof(*rst_info->wiener_info) * ntiles);

  aom_free(rst_info->sgrproj_info);
  CHECK_MEM_ERROR(
      cm, rst_info->sgrproj_info,
      (SgrprojInfo *)aom_malloc(sizeof(*rst_info->sgrproj_info) * ntiles));
  return ntiles;
}

// Releases the per-tile arrays owned by `rst_info` and resets the pointers
// to NULL so that a later free or re-allocation is safe.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  aom_free(rst_info->restoration_type);
  rst_info->restoration_type = NULL;
  aom_free(rst_info->wiener_info);
  rst_info->wiener_info = NULL;
  aom_free(rst_info->sgrproj_info);
  rst_info->sgrproj_info = NULL;
}
78
79
80
81

#define MAX_RADIUS 3  // Only 1, 2, 3 allowed
#define MAX_EPS 80    // Max value of eps
// Number of pixels in the largest (2 * MAX_RADIUS + 1)^2 window.
#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))

// Fixed-point precision of the entries of sgrproj_mtable.
#define SGRPROJ_MTABLE_BITS 20
// Fixed-point precision of the entries of one_by_x.
#define SGRPROJ_RECIP_BITS 12

// Multiplier table indexed by [eps - 1][n - 1]; populated at run time by
// GenSgrprojVtable().
// TODO(debargha): This table can be substantially reduced since only a few
// values are actually used.
int sgrproj_mtable[MAX_EPS][MAX_NELEM];
88
89
90
91
92
93
94
95
96
97

// Populates sgrproj_mtable: for every eps in 1..MAX_EPS and every window
// population n in 1..MAX_NELEM, entry [eps - 1][n - 1] is set to
// (2^SGRPROJ_MTABLE_BITS) / (n * n * eps), rounded to nearest.
static void GenSgrprojVtable() {
  const int one = 1 << SGRPROJ_MTABLE_BITS;
  int eps, nelem;
  for (eps = 1; eps <= MAX_EPS; ++eps) {
    int *const row = sgrproj_mtable[eps - 1];
    for (nelem = 1; nelem <= MAX_NELEM; ++nelem) {
      const int denom = nelem * nelem * eps;
      // Adding denom / 2 before dividing rounds to nearest.
      row[nelem - 1] = (one + denom / 2) / denom;
    }
  }
}
98
99

// One-time precalculation for loop restoration; builds the self-guided
// filter multiplier table.
void av1_loop_restoration_precal() {
  GenSgrprojVtable();
}
100

101
// Initializes the per-frame restoration state: records in `rst` whether the
// current frame is a key frame (`kf`).
static void loop_restoration_init(RestorationInternal *rst, int kf) {
  rst->keyframe = kf;
}

105
// Pads the frame in place by WIENER_HALFWIN pixels on every side by
// replicating the nearest edge pixel.
//
// `data` points at the top-left pixel of the width x height frame with row
// pitch `stride`. The buffer must have at least WIENER_HALFWIN writable
// pixels to the left and right of every row and WIENER_HALFWIN writable
// rows above and below the frame.
void extend_frame(uint8_t *data, int width, int height, int stride) {
  uint8_t *data_p;
  int i;
  // Left and right borders: repeat the first / last pixel of each row.
  for (i = 0; i < height; ++i) {
    data_p = data + i * stride;
    memset(data_p - WIENER_HALFWIN, data_p[0], WIENER_HALFWIN);
    memset(data_p + width, data_p[width - 1], WIENER_HALFWIN);
  }
  // Top and bottom borders: copy the (already widened) first / last row.
  data_p = data - WIENER_HALFWIN;
  for (i = -WIENER_HALFWIN; i < 0; ++i) {
    memcpy(data_p + i * stride, data_p, width + 2 * WIENER_HALFWIN);
  }
  for (i = height; i < height + WIENER_HALFWIN; ++i) {
    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
           width + 2 * WIENER_HALFWIN);
  }
}

123
124
125
126
// Copies one restoration (sub)tile unchanged from `data` to `dst`.
//
// The pixel rectangle covered by the tile is obtained from
// av1_get_rest_tile_limits(); each row of that rectangle is copied with
// memcpy, so `data` and `dst` must not overlap within it.
static void loop_copy_tile(uint8_t *data, int tile_idx, int subtile_idx,
                           int subtile_bits, int width, int height, int stride,
                           RestorationInternal *rst, uint8_t *dst,
                           int dst_stride) {
  const int tile_width = rst->tile_width;
  const int tile_height = rst->tile_height;
  int i;
  int h_start, h_end, v_start, v_end;
  av1_get_rest_tile_limits(tile_idx, subtile_idx, subtile_bits, rst->nhtiles,
                           rst->nvtiles, tile_width, tile_height, width, height,
                           0, 0, &h_start, &h_end, &v_start, &v_end);
  for (i = v_start; i < v_end; ++i)
    memcpy(dst + i * dst_stride + h_start, data + i * stride + h_start,
           h_end - h_start);
}

139
140
static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
                                    int height, int stride,
141
                                    RestorationInternal *rst, uint8_t *dst,
142
                                    int dst_stride) {
143
144
  const int tile_width = rst->tile_width;
  const int tile_height = rst->tile_height;
145
146
  int i, j;
  int h_start, h_end, v_start, v_end;
147
  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
148
149
150
151
    loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
                   dst_stride);
    return;
  }
152
  av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
153
                           tile_width, tile_height, width, height, 0, 0,
154
                           &h_start, &h_end, &v_start, &v_end);
155
156
157
158
159
160
161
162
  // Convolve the whole tile (done in blocks here to match the requirements
  // of the vectorized convolve functions, but the result is equivalent)
  for (i = v_start; i < v_end; i += MAX_SB_SIZE)
    for (j = h_start; j < h_end; j += MAX_SB_SIZE) {
      int w = AOMMIN(MAX_SB_SIZE, (h_end - j + 15) & ~15);
      int h = AOMMIN(MAX_SB_SIZE, (v_end - i + 15) & ~15);
      const uint8_t *data_p = data + i * stride + j;
      uint8_t *dst_p = dst + i * dst_stride + j;
163
164
165
166
167
168
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
      aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
                                rst->rsi->wiener_info[tile_idx].hfilter, 16,
                                rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
                                h);
#else
169
170
171
      aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
                            rst->rsi->wiener_info[tile_idx].hfilter, 16,
                            rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h);
172
#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
173
174
175
    }
}

clang-format's avatar
clang-format committed
176
static void loop_wiener_filter(uint8_t *data, int width, int height, int stride,
177
178
179
180
                               RestorationInternal *rst, uint8_t *dst,
                               int dst_stride) {
  int tile_idx;
  extend_frame(data, width, height, stride);
181
  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
182
183
    loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst,
                            dst_stride);
184
  }
185
}
186

187
188
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
   specialize to r = 1, 2, 3. A default function is used for r > 3.

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
// 3x3 (r = 1) box sum of `src` (or of its squares when `sqr` is nonzero),
// written to `dst`. Windows are clipped at the borders of the
// width x height region, so border outputs sum fewer elements.
//
// The sum is separable: a vertical 3-tap pass goes from `src` into `dst`,
// then a horizontal 3-tap pass runs in place on `dst`. The code reads rows
// and columns 0..2 unconditionally, so width and height must be >= 3.
static void boxsum1(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int i, j, a, b, c;

  // Vertical sum over 3-pixel regions, from src into dst.
  if (!sqr) {
    for (j = 0; j < width; ++j) {
      a = src[j];
      b = src[src_stride + j];
      c = src[2 * src_stride + j];

      dst[j] = a + b;
      for (i = 1; i < height - 2; ++i) {
        // Loop invariant: At the start of each iteration,
        // a = src[(i - 1) * src_stride + j]
        // b = src[(i    ) * src_stride + j]
        // c = src[(i + 1) * src_stride + j]
        dst[i * dst_stride + j] = a + b + c;
        a = b;
        b = c;
        c = src[(i + 2) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c;
      dst[(i + 1) * dst_stride + j] = b + c;
    }
  } else {
    for (j = 0; j < width; ++j) {
      a = src[j] * src[j];
      b = src[src_stride + j] * src[src_stride + j];
      c = src[2 * src_stride + j] * src[2 * src_stride + j];

      dst[j] = a + b;
      for (i = 1; i < height - 2; ++i) {
        // Same invariant as above, with a, b, c holding squared values.
        dst[i * dst_stride + j] = a + b + c;
        a = b;
        b = c;
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c;
      dst[(i + 1) * dst_stride + j] = b + c;
    }
  }

  // Horizontal sum over 3-pixel regions of dst
  for (i = 0; i < height; ++i) {
    a = dst[i * dst_stride];
    b = dst[i * dst_stride + 1];
    c = dst[i * dst_stride + 2];

    dst[i * dst_stride] = a + b;
    for (j = 1; j < width - 2; ++j) {
      // Loop invariant: At the start of each iteration, a, b and c hold the
      // vertical sums (the values dst had before this pass) at columns
      // j - 1, j and j + 1 of row i.
      dst[i * dst_stride + j] = a + b + c;
      a = b;
      b = c;
      c = dst[i * dst_stride + (j + 2)];
    }
    dst[i * dst_stride + j] = a + b + c;
    dst[i * dst_stride + (j + 1)] = b + c;
  }
}

// 5x5 (r = 2) box sum of `src`, or of its squares when `sqr` is nonzero,
// written to `dst`. Windows are clipped at the region borders.
//
// Implemented as a vertical 5-tap sliding sum from `src` into `dst`
// followed by an in-place horizontal 5-tap sliding sum over `dst`. Rows and
// columns 0..4 are read unconditionally, so width and height must be >= 5.
static void boxsum2(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int row, col;
  int t0, t1, t2, t3, t4;

  // Vertical pass: one column at a time, src -> dst.
  for (col = 0; col < width; ++col) {
    const int32_t *in = src + col;
    int32_t *out = dst + col;

    t0 = in[0];
    t1 = in[src_stride];
    t2 = in[2 * src_stride];
    t3 = in[3 * src_stride];
    t4 = in[4 * src_stride];
    if (sqr) {
      t0 *= t0;
      t1 *= t1;
      t2 *= t2;
      t3 *= t3;
      t4 *= t4;
    }

    // The first two rows have a window truncated at the top.
    out[0] = t0 + t1 + t2;
    out[dst_stride] = t0 + t1 + t2 + t3;
    for (row = 2; row < height - 3; ++row) {
      // t0..t4 hold the (possibly squared) inputs of rows row-2..row+2.
      out[row * dst_stride] = t0 + t1 + t2 + t3 + t4;
      t0 = t1;
      t1 = t2;
      t2 = t3;
      t3 = t4;
      t4 = in[(row + 3) * src_stride];
      if (sqr) t4 *= t4;
    }
    // Last full window, then the two windows truncated at the bottom.
    out[row * dst_stride] = t0 + t1 + t2 + t3 + t4;
    out[(row + 1) * dst_stride] = t1 + t2 + t3 + t4;
    out[(row + 2) * dst_stride] = t2 + t3 + t4;
  }

  // Horizontal pass: one row at a time, in place on dst.
  for (row = 0; row < height; ++row) {
    int32_t *line = dst + row * dst_stride;

    t0 = line[0];
    t1 = line[1];
    t2 = line[2];
    t3 = line[3];
    t4 = line[4];

    line[0] = t0 + t1 + t2;
    line[1] = t0 + t1 + t2 + t3;
    for (col = 2; col < width - 3; ++col) {
      // t0..t4 hold the vertical sums of columns col-2..col+2.
      line[col] = t0 + t1 + t2 + t3 + t4;
      t0 = t1;
      t1 = t2;
      t2 = t3;
      t3 = t4;
      t4 = line[col + 3];
    }
    line[col] = t0 + t1 + t2 + t3 + t4;
    line[col + 1] = t1 + t2 + t3 + t4;
    line[col + 2] = t2 + t3 + t4;
  }
}

346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
// 7x7 (r = 3) box sum of `src`, or of its squares when `sqr` is nonzero,
// written to `dst`. Windows are clipped at the region borders.
//
// Implemented as a vertical 7-tap sliding sum from `src` into `dst`
// followed by an in-place horizontal 7-tap sliding sum over `dst`. Rows and
// columns 0..6 are read unconditionally, so width and height must be >= 7.
static void boxsum3(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int row, col;
  int t0, t1, t2, t3, t4, t5, t6;

  // Vertical pass: one column at a time, src -> dst.
  for (col = 0; col < width; ++col) {
    const int32_t *in = src + col;
    int32_t *out = dst + col;

    t0 = in[0];
    t1 = in[1 * src_stride];
    t2 = in[2 * src_stride];
    t3 = in[3 * src_stride];
    t4 = in[4 * src_stride];
    t5 = in[5 * src_stride];
    t6 = in[6 * src_stride];
    if (sqr) {
      t0 *= t0;
      t1 *= t1;
      t2 *= t2;
      t3 *= t3;
      t4 *= t4;
      t5 *= t5;
      t6 *= t6;
    }

    // The first three rows have a window truncated at the top.
    out[0] = t0 + t1 + t2 + t3;
    out[dst_stride] = t0 + t1 + t2 + t3 + t4;
    out[2 * dst_stride] = t0 + t1 + t2 + t3 + t4 + t5;
    for (row = 3; row < height - 4; ++row) {
      // t0..t6 hold the (possibly squared) inputs of rows row-3..row+3.
      out[row * dst_stride] = t0 + t1 + t2 + t3 + t4 + t5 + t6;
      t0 = t1;
      t1 = t2;
      t2 = t3;
      t3 = t4;
      t4 = t5;
      t5 = t6;
      t6 = in[(row + 4) * src_stride];
      if (sqr) t6 *= t6;
    }
    // Last full window, then the three windows truncated at the bottom.
    out[row * dst_stride] = t0 + t1 + t2 + t3 + t4 + t5 + t6;
    out[(row + 1) * dst_stride] = t1 + t2 + t3 + t4 + t5 + t6;
    out[(row + 2) * dst_stride] = t2 + t3 + t4 + t5 + t6;
    out[(row + 3) * dst_stride] = t3 + t4 + t5 + t6;
  }

  // Horizontal pass: one row at a time, in place on dst.
  for (row = 0; row < height; ++row) {
    int32_t *line = dst + row * dst_stride;

    t0 = line[0];
    t1 = line[1];
    t2 = line[2];
    t3 = line[3];
    t4 = line[4];
    t5 = line[5];
    t6 = line[6];

    line[0] = t0 + t1 + t2 + t3;
    line[1] = t0 + t1 + t2 + t3 + t4;
    line[2] = t0 + t1 + t2 + t3 + t4 + t5;
    for (col = 3; col < width - 4; ++col) {
      // t0..t6 hold the vertical sums of columns col-3..col+3.
      line[col] = t0 + t1 + t2 + t3 + t4 + t5 + t6;
      t0 = t1;
      t1 = t2;
      t2 = t3;
      t3 = t4;
      t4 = t5;
      t5 = t6;
      t6 = line[col + 4];
    }
    line[col] = t0 + t1 + t2 + t3 + t4 + t5 + t6;
    line[col + 1] = t1 + t2 + t3 + t4 + t5 + t6;
    line[col + 2] = t2 + t3 + t4 + t5 + t6;
    line[col + 3] = t3 + t4 + t5 + t6;
  }
}

// Generic version for any r. To be removed after experiments are done.
//
// Computes the (2r + 1)x(2r + 1) box sum of `src` (or of its squares when
// `sqr` is nonzero) into `dst`, clipping windows at the region borders.
// Uses prefix sums held in a temporary width x height buffer: first down
// each column, then along each row. The indexing assumes width and height
// are both greater than 2 * r.
static void boxsumr(int32_t *src, int width, int height, int src_stride, int r,
                    int sqr, int32_t *dst, int dst_stride) {
  int32_t *tmp = aom_malloc(width * height * sizeof(*tmp));
  int tmp_stride = width;
  int i, j;

  // Column-wise prefix sums of src (or src^2) into tmp.
  if (sqr) {
    for (j = 0; j < width; ++j) tmp[j] = src[j] * src[j];
    for (j = 0; j < width; ++j)
      for (i = 1; i < height; ++i)
        tmp[i * tmp_stride + j] =
            tmp[(i - 1) * tmp_stride + j] +
            src[i * src_stride + j] * src[i * src_stride + j];
  } else {
    memcpy(tmp, src, sizeof(*tmp) * width);
    for (j = 0; j < width; ++j)
      for (i = 1; i < height; ++i)
        tmp[i * tmp_stride + j] =
            tmp[(i - 1) * tmp_stride + j] + src[i * src_stride + j];
  }

  // Vertical window sums into dst: top rows (window clipped above),
  // middle rows (full window), bottom rows (window clipped below).
  for (i = 0; i <= r; ++i)
    memcpy(&dst[i * dst_stride], &tmp[(i + r) * tmp_stride],
           sizeof(*tmp) * width);
  for (i = r + 1; i < height - r; ++i)
    for (j = 0; j < width; ++j)
      dst[i * dst_stride + j] =
          tmp[(i + r) * tmp_stride + j] - tmp[(i - r - 1) * tmp_stride + j];
  for (i = height - r; i < height; ++i)
    for (j = 0; j < width; ++j)
      dst[i * dst_stride + j] = tmp[(height - 1) * tmp_stride + j] -
                                tmp[(i - r - 1) * tmp_stride + j];

  // Row-wise prefix sums of the vertical sums, back into tmp. dst must be
  // indexed with its own stride here; using src_stride reads the wrong
  // elements whenever the two strides differ.
  for (i = 0; i < height; ++i) tmp[i * tmp_stride] = dst[i * dst_stride];
  for (i = 0; i < height; ++i)
    for (j = 1; j < width; ++j)
      tmp[i * tmp_stride + j] =
          tmp[i * tmp_stride + j - 1] + dst[i * dst_stride + j];

  // Horizontal window sums into dst: left, middle and right columns.
  for (j = 0; j <= r; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] = tmp[i * tmp_stride + j + r];
  for (j = r + 1; j < width - r; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] =
          tmp[i * tmp_stride + j + r] - tmp[i * tmp_stride + j - r - 1];
  for (j = width - r; j < width; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] =
          tmp[i * tmp_stride + width - 1] - tmp[i * tmp_stride + j - r - 1];
  aom_free(tmp);
}

491
492
493
494
495
496
// Computes the (2r + 1)x(2r + 1) box sum of `src` (or of its squares when
// `sqr` is nonzero) into `dst`, dispatching to the specialized routine for
// r = 1, 2, 3 and to the generic boxsumr() for any other radius.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 3:
      boxsum3(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default:
      boxsumr(src, width, height, src_stride, r, sqr, dst, dst_stride);
      break;
  }
}

// Fills `num` (row pitch `num_stride`) with, for every pixel of a
// width x height region, the number of pixels of the (2r + 1)x(2r + 1)
// window centred on it that lie inside the region.
//
// The region is handled in four parts: the corner blocks, the left/right
// edge strips, the top/bottom edge strips, and the interior where the full
// window fits.
static void boxnum(int width, int height, int r, int8_t *num, int num_stride) {
  int i, j;
  // Corners: window clipped both vertically and horizontally. The value for
  // the top-left corner is mirrored into the other three.
  for (i = 0; i <= r; ++i) {
    for (j = 0; j <= r; ++j) {
      num[i * num_stride + j] = (r + 1 + i) * (r + 1 + j);
      num[i * num_stride + (width - 1 - j)] = num[i * num_stride + j];
      num[(height - 1 - i) * num_stride + j] = num[i * num_stride + j];
      num[(height - 1 - i) * num_stride + (width - 1 - j)] =
          num[i * num_stride + j];
    }
  }
  // Left and right strips: full window height, clipped width.
  for (j = 0; j <= r; ++j) {
    const int val = (2 * r + 1) * (r + 1 + j);
    for (i = r + 1; i < height - r; ++i) {
      num[i * num_stride + j] = val;
      num[i * num_stride + (width - 1 - j)] = val;
    }
  }
  // Top and bottom strips: full window width, clipped height.
  for (i = 0; i <= r; ++i) {
    const int val = (2 * r + 1) * (r + 1 + i);
    for (j = r + 1; j < width - r; ++j) {
      num[i * num_stride + j] = val;
      num[(height - 1 - i) * num_stride + j] = val;
    }
  }
  // Interior: the whole window is inside the region.
  for (i = r + 1; i < height - r; ++i) {
    for (j = r + 1; j < width - r; ++j) {
      num[i * num_stride + j] = (2 * r + 1) * (2 * r + 1);
    }
  }
}

void decode_xq(int *xqd, int *xq) {
536
  xq[0] = xqd[0];
537
538
539
  xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
}

David Barker's avatar
David Barker committed
540
// Lookup table used by the self-guided filter, indexed by a value z clamped
// to 0..255. Entries follow round(256 * z / (z + 1)) except the last, which
// is 256.
const int32_t x_by_xplus1[256] = {
  0,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};

David Barker's avatar
David Barker committed
561
// Fixed-point reciprocals: one_by_x[n - 1] = round(2^12 / n) for
// n = 1..MAX_NELEM (the possible pixel counts of a box-filter window).
const int32_t one_by_x[MAX_NELEM] = {
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164, 158,
  152,  146,  141,  137,  132, 128, 124, 120, 117, 114, 111, 108, 105,
  102,  100,  98,   95,   93,  91,  89,  87,  85,  84
};

568
569
570
571
static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
                                                int height, int stride,
                                                int bit_depth, int r, int eps,
                                                int32_t *tmpbuf) {
572
  int32_t *A = tmpbuf;
573
  int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
574
575
  int8_t num[RESTORATION_TILEPELS_MAX];
  int i, j;
David Barker's avatar
David Barker committed
576
577
578
579
580
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 16 bytes, for consistency
  // with the SIMD version of this function.
  int buf_stride = ((width + 3) & ~3) + 16;
581

582
583
584
  // Don't filter tiles with dimensions < 5 on any axis
  if ((width < 5) || (height < 5)) return;

David Barker's avatar
David Barker committed
585
586
  boxsum(dgd, width, height, stride, r, 0, B, buf_stride);
  boxsum(dgd, width, height, stride, r, 1, A, buf_stride);
587
  boxnum(width, height, r, num, width);
588
  assert(r <= 3);
589
590
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
David Barker's avatar
David Barker committed
591
592
      const int k = i * buf_stride + j;
      const int n = num[i * width + j];
593

594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
      // a < 2^16 * n < 2^22 regardless of bit depth
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
      // b < 2^8 * n < 2^14 regardless of bit depth
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);

      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
      // This is an artefact of rounding, and can only happen if all pixels
      // are (almost) identical, so in this case we saturate to p=0.
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
      uint32_t s = sgrproj_mtable[eps - 1][n - 1];

      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
      // (this holds even after accounting for the rounding in s)
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);

      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // < 2^8

      // SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^(bit_depth) * n,
      // one_by_x[n - 1] = round(2^12 / n)
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
      // and B[k] is set to a value < 2^(8 + bit depth)
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
                                             (uint32_t)B[k] *
                                             (uint32_t)one_by_x[n - 1],
                                         SGRPROJ_RECIP_BITS);
622
623
624
625
626
    }
  }
  i = 0;
  j = 0;
  {
David Barker's avatar
David Barker committed
627
    const int k = i * buf_stride + j;
628
629
    const int l = i * stride + j;
    const int nb = 3;
630
    const int32_t a =
David Barker's avatar
David Barker committed
631
        3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + A[k + buf_stride + 1];
632
    const int32_t b =
David Barker's avatar
David Barker committed
633
        3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + B[k + buf_stride + 1];
634
635
    const int32_t v = a * dgd[l] + b;
    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
636
637
638
639
  }
  i = 0;
  j = width - 1;
  {
David Barker's avatar
David Barker committed
640
    const int k = i * buf_stride + j;
641
642
    const int l = i * stride + j;
    const int nb = 3;
643
    const int32_t a =
David Barker's avatar
David Barker committed
644
        3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + A[k + buf_stride - 1];
645
    const int32_t b =
David Barker's avatar
David Barker committed
646
        3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + B[k + buf_stride - 1];
647
648
    const int32_t v = a * dgd[l] + b;
    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
649
650
651
652
  }
  i = height - 1;
  j = 0;
  {
David Barker's avatar
David Barker committed
653
    const int k = i * buf_stride + j;
654
655
    const int l = i * stride + j;
    const int nb = 3;
656
    const int32_t a =
David Barker's avatar
David Barker committed
657
        3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + A[k - buf_stride + 1];
658
    const int32_t b =
David Barker's avatar
David Barker committed
659
        3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + B[k - buf_stride + 1];
660
661
    const int32_t v = a * dgd[l] + b;
    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
662
663
664
665
  }
  i = height - 1;
  j = width - 1;
  {
David Barker's avatar
David Barker committed
666
    const int k = i * buf_stride + j;
667
668
    const int l = i * stride + j;
    const int nb = 3;
669
    const int32_t a =
David Barker's avatar
David Barker committed
670
        3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + A[k - buf_stride - 1];
671
    const int32_t b =
David Barker's avatar
David Barker committed
672
        3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + B[k - buf_stride - 1];
673
674
    const int32_t v = a * dgd[l] + b;
    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
675
676
677
  }
  i = 0;
  for (j = 1; j < width - 1; ++j) {
David Barker's avatar
David Barker committed
678
    const int k = i * buf_stride + j;
679
680
    const int l = i * stride + j;
    const int nb = 3;
David Barker's avatar
David Barker committed
681
682
683
684
    const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
                      A[k + buf_stride - 1] + A[k + buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
                      B[k + buf_stride - 1] + B[k + buf_stride + 1];
685
686
    const int32_t v = a * dgd[l] + b;
    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
687
688
689
  }
  i = height - 1;
  for (j = 1; j < width - 1; ++j) {
David Barker's avatar
David Barker committed
690
    const int k = i * buf_stride + j;
691
692
    const int l = i * stride + j;
    const int nb = 3;
David Barker's avatar
David Barker committed
693
694
695
696
    const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
                      A[k - buf_stride - 1] + A[k - buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
                      B[k - buf_stride - 1] + B[k - buf_stride + 1];
697
698
    const int32_t v = a * dgd[l] + b;
    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
699
700
701
  }
  j = 0;
  for (i = 1; i < height - 1; ++i) {
David Barker's avatar
David Barker committed
702
    const int k = i * buf_stride + j;
703
704
    const int l = i * stride + j;
    const int nb = 3;
David Barker's avatar
David Barker committed
705
706
707
708
    const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
                      A[k + 1] + A[k - buf_stride + 1] + A[k + buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
                      B[k + 1] + B[k - buf_stride + 1] + B[k + buf_stride + 1];
709
710
    const int32_t v = a * dgd[l] + b;
    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
711
712
713
  }
  j = width - 1;
  for (i = 1; i < height - 1; ++i) {
David Barker's avatar
David Barker committed
714
    const int k = i * buf_stride + j;
715
716
    const int l = i * stride + j;
    const int nb = 3;
David Barker's avatar
David Barker committed
717
718
719
720
    const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
                      A[k - 1] + A[k - buf_stride - 1] + A[k + buf_stride - 1];
    const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
                      B[k - 1] + B[k - buf_stride - 1] + B[k + buf_stride - 1];
721
722
    const int32_t v = a * dgd[l] + b;
    dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
723
724
725
  }
  for (i = 1; i < height - 1; ++i) {
    for (j = 1; j < width - 1; ++j) {
David Barker's avatar
David Barker committed
726
      const int k = i * buf_stride + j;
727
728
      const int l = i * stride + j;
      const int nb = 5;
729
      const int32_t a =
David Barker's avatar
David Barker committed
730
731
732
733
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
              4 +
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
734
              3;
735
      const int32_t b =
David Barker's avatar
David Barker committed
736
737
738
739
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
              4 +
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
740
              3;
741
742
      const int32_t v = a * dgd[l] + b;
      dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
743
744
745
746
    }
  }
}

747
748
// 8-bit front end for the self-guided restoration filter.
//
// Widens the width x height block at dgd (row pitch `stride`) into the int32
// buffer dst (row pitch `dst_stride`), then passes dst to
// av1_selfguided_restoration_internal() with a bit depth of 8. r, eps and
// tmpbuf are forwarded to that routine unchanged.
void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
                                  int stride, int32_t *dst, int dst_stride,
                                  int r, int eps, int32_t *tmpbuf) {
  int row;
  for (row = 0; row < height; ++row) {
    const int src_off = row * stride;
    const int dst_off = row * dst_stride;
    int col;
    for (col = 0; col < width; ++col) dst[dst_off + col] = dgd[src_off + col];
  }
  av1_selfguided_restoration_internal(dst, width, height, dst_stride, 8, r, eps,
                                      tmpbuf);
}

760
761
void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride,
                           int32_t *dst, int dst_stride, int corner, int edge) {
762
  int i, j;
763
  const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
764
765
766
767
768

  i = 0;
  j = 0;
  {
    const int k = i * stride + j;
769
770
771
772
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
        corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
773
774
775
776
777
  }
  i = 0;
  j = width - 1;
  {
    const int k = i * stride + j;
778
779
780
781
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
        corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
782
783
784
785
786
  }
  i = height - 1;
  j = 0;
  {
    const int k = i * stride + j;
787
788
789
790
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
        corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
791
792
793
794
795
  }
  i = height - 1;
  j = width - 1;
  {
    const int k = i * stride + j;
796
797
798
799
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
        corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
800
801
802
803
  }
  i = 0;
  for (j = 1; j < width - 1; ++j) {
    const int k = i * stride + j;
804
805
806
807
808
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
             corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
                       dgd[k + 1]);
809
810
811
812
  }
  i = height - 1;
  for (j = 1; j < width - 1; ++j) {
    const int k = i * stride + j;
813
814
815
816
817
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
             corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
                       dgd[k + 1]);
818
819
820
821
  }
  j = 0;
  for (i = 1; i < height - 1; ++i) {
    const int k = i * stride + j;
822
823
824
825
826
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
             corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
                       dgd[k - stride] + dgd[k + stride]);
827
828
829
830
  }
  j = width - 1;
  for (i = 1; i < height - 1; ++i) {
    const int k = i * stride + j;
831
832
833
834
835
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
             corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
                       dgd[k - stride] + dgd[k + stride]);
836
837
838
839
  }
  for (i = 1; i < height - 1; ++i) {
    for (j = 1; j < width - 1; ++j) {
      const int k = i * stride + j;
840
841
842
843
844
845
      const int l = i * dst_stride + j;
      dst[l] =
          center * dgd[k] +
          edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
          corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
                    dgd[k - stride + 1] + dgd[k + stride + 1]);
846
847
848
849
    }
  }
}

David Barker's avatar
David Barker committed
850
void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
851
852
                                    int stride, int eps, int *xqd, uint8_t *dst,
                                    int dst_stride, int32_t *tmpbuf) {
853
  int xq[2];
854
  int32_t *flt1 = tmpbuf;
855
  int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
856
  int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
857
  int i, j;
858
  assert(width * height <= RESTORATION_TILEPELS_MAX);
859
860
#if USE_HIGHPASS_IN_SGRPROJ
  av1_highpass_filter_c(dat, width, height, stride, flt1, width,
861
                        sgr_params[eps].corner, sgr_params[eps].edge);
862
#else
863
  av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
864
                               sgr_params[eps].r1, sgr_params[eps].e1, tmpbuf2);
865
#endif  // USE_HIGHPASS_IN_SGRPROJ
866
  av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
867
                               sgr_params[eps].r2, sgr_params[eps].e2, tmpbuf2);
868
869
870
871
872
  decode_xq(xqd, xq);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int k = i * width + j;
      const int l = i * stride + j;
873
874
875
876
      const int m = i * dst_stride + j;
      const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
      const int32_t f1 = (int32_t)flt1[k] - u;
      const int32_t f2 = (int32_t)flt2[k] - u;
David Barker's avatar
David Barker committed
877
      const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
878
879
      const int16_t w =
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
880
      dst[m] = clip_pixel(w);
881
882
883
884
885
886
    }
  }
}

static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
                                     int height, int stride,
887
888
                                     RestorationInternal *rst, uint8_t *dst,
                                     int dst_stride) {
889
890
  const int tile_width = rst->tile_width;
  const int tile_height = rst->tile_height;
891
  int h_start, h_end, v_start, v_end;
892
  uint8_t *data_p, *dst_p;
893

894
  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
895
896
897
898
    loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
                   dst_stride);
    return;
  }
899
900
901
902
  av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
                           tile_width, tile_height, width, height, 0, 0,
                           &h_start, &h_end, &v_start, &v_end);
  data_p = data + h_start + v_start * stride;
903
  dst_p = dst + h_start + v_start * dst_stride;
904
  apply_selfguided_restoration(data_p, h_end - h_start, v_end - v_start, stride,
905
                               rst->rsi->sgrproj_info[tile_idx].ep,
906
                               rst->rsi->sgrproj_info[tile_idx].xqd, dst_p,
907
                               dst_stride, rst->tmpbuf);
908
909
910
911
}

// Runs loop_sgrproj_filter_tile() on every restoration tile of the frame,
// in increasing tile index order from 0 to rst->ntiles - 1.
static void loop_sgrproj_filter(uint8_t *data, int width, int height,
                                int stride, RestorationInternal *rst,
                                uint8_t *dst, int dst_stride) {
  int tile = 0;
  while (tile < rst->ntiles) {
    loop_sgrproj_filter_tile(data, tile, width, height, stride, rst, dst,
                             dst_stride);
    ++tile;
  }
}

920
921
// Restores a frame in which each tile selects its own restoration type.
//
// The frame border is extended once with extend_frame() before any tile is
// processed. Each tile is then dispatched on rst->rsi->restoration_type:
// RESTORE_NONE copies the tile, RESTORE_WIENER and RESTORE_SGRPROJ call the
// matching per-tile filter. Any other type leaves the tile in dst untouched.
static void loop_switchable_filter(uint8_t *data, int width, int height,
                                   int stride, RestorationInternal *rst,
                                   uint8_t *dst, int dst_stride) {
  int tile;
  extend_frame(data, width, height, stride);
  for (tile = 0; tile < rst->ntiles; ++tile) {
    switch (rst->rsi->restoration_type[tile]) {
      case RESTORE_NONE:
        loop_copy_tile(data, tile, 0, 0, width, height, stride, rst, dst,
                       dst_stride);
        break;
      case RESTORE_WIENER:
        loop_wiener_filter_tile(data, tile, width, height, stride, rst, dst,
                                dst_stride);
        break;
      case RESTORE_SGRPROJ:
        loop_sgrproj_filter_tile(data, tile, width, height, stride, rst, dst,
                                 dst_stride);
        break;
      default: break;
    }
  }
}

939
#if CONFIG_HIGHBITDEPTH
940
// Pads a high bit-depth frame by WIENER_HALFWIN pixels on every side by
// replicating its border pixels.
//
// Each row is first extended left and right with copies of its first and last
// pixel. The first and last rows, including their new left/right padding, are
// then copied WIENER_HALFWIN times above and below the frame. The caller must
// provide writable storage for that margin around `data`.
void extend_frame_highbd(uint16_t *data, int width, int height, int stride) {
  // Length of one padded row, in bytes.
  const size_t padded_row_bytes =
      (width + 2 * WIENER_HALFWIN) * sizeof(uint16_t);
  uint16_t *const padded_origin = data - WIENER_HALFWIN;
  uint16_t *const padded_last_row = padded_origin + (height - 1) * stride;
  int row, k;

  // Horizontal padding, one row at a time.
  for (row = 0; row < height; ++row) {
    uint16_t *const line = data + row * stride;
    for (k = WIENER_HALFWIN; k >= 1; --k) line[-k] = line[0];
    for (k = 0; k < WIENER_HALFWIN; ++k) line[width + k] = line[width - 1];
  }

  // Rows above the frame are copies of the padded first row.
  for (k = WIENER_HALFWIN; k >= 1; --k) {
    memcpy(padded_origin - k * stride, padded_origin, padded_row_bytes);
  }
  // Rows below the frame are copies of the padded last row.
  for (k = 0; k < WIENER_HALFWIN; ++k) {
    memcpy(padded_origin + (height + k) * stride, padded_last_row,
           padded_row_bytes);
  }
}

960
961
962
963
// Copies one restoration tile (or sub-tile) of a high bit-depth frame from
// data to dst without filtering.
//
// The rectangle to copy is obtained from av1_get_rest_tile_limits() for the
// given tile_idx / subtile_idx / subtile_bits. Each row of the rectangle is
// copied with memcpy.
static void loop_copy_tile_highbd(uint16_t *data, int tile_idx, int subtile_idx,
                                  int subtile_bits, int width, int height,
                                  int stride, RestorationInternal *rst,
                                  uint16_t *dst, int dst_stride) {
  int h_start, h_end, v_start, v_end;
  int row;

  av1_get_rest_tile_limits(tile_idx, subtile_idx, subtile_bits, rst->nhtiles,
                           rst->nvtiles, rst->tile_width, rst->tile_height,
                           width, height, 0, 0, &h_start, &h_end, &v_start,
                           &v_end);

  row = v_start;
  while (row < v_end) {
    uint16_t *const dst_row = dst + row * dst_stride + h_start;
    const uint16_t *const src_row = data + row * stride + h_start;
    memcpy(dst_row, src_row, (h_end - h_start) * sizeof(*dst));
    ++row;
  }
}

976
977
978
// Applies the Wiener filter to one restoration tile of a high bit-depth frame.
//
// A tile whose restoration type is RESTORE_NONE is copied to dst with
// loop_copy_tile_highbd(). Otherwise the tile rectangle is obtained from
// av1_get_rest_tile_limits() and convolved with the tile's hfilter / vfilter
// from rst->rsi->wiener_info.
static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
                                           int width, int height, int stride,
                                           RestorationInternal *rst,
                                           int bit_depth, uint16_t *dst,
                                           int dst_stride) {
  int h_start, h_end, v_start, v_end;
  int row, col;

  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
    loop_copy_tile_highbd(data, tile_idx, 0, 0, width, height, stride, rst, dst,
                          dst_stride);
    return;
  }

  av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
                           rst->tile_width, rst->tile_height, width, height, 0,
                           0, &h_start, &h_end, &v_start, &v_end);

  // The tile is convolved in blocks of at most MAX_SB_SIZE x MAX_SB_SIZE to
  // suit the vectorized convolve functions. Block dimensions are rounded up to
  // a multiple of 16, so a block may reach past the tile limits.
  for (row = v_start; row < v_end; row += MAX_SB_SIZE) {
    const int blk_h = AOMMIN(MAX_SB_SIZE, (v_end - row + 15) & ~15);
    for (col = h_start; col < h_end; col += MAX_SB_SIZE) {
      const int blk_w = AOMMIN(MAX_SB_SIZE, (h_end - col + 15) & ~15);
      const uint16_t *src_blk = data + row * stride + col;
      uint16_t *dst_blk = dst + row * dst_stride + col;
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
      aom_highbd_convolve8_add_src_hip(
          CONVERT_TO_BYTEPTR(src_blk), stride, CONVERT_TO_BYTEPTR(dst_blk),
          dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
          rst->rsi->wiener_info[tile_idx].vfilter, 16, blk_w, blk_h, bit_depth);
#else
      aom_highbd_convolve8_add_src(
          CONVERT_TO_BYTEPTR(src_blk), stride, CONVERT_TO_BYTEPTR(dst_blk),
          dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
          rst->rsi->wiener_info[tile_idx].vfilter, 16, blk_w, blk_h, bit_depth);
#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
    }
  }
}

1016
1017
// Wiener-filters a whole high bit-depth frame.
//
// data8 and dst8 are converted with CONVERT_TO_SHORTPTR() to obtain the
// 16-bit sample pointers. The source border is extended once with
// extend_frame_highbd(), then every tile from 0 to rst->ntiles - 1 is
// processed by loop_wiener_filter_tile_highbd().
static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
                                      int stride, RestorationInternal *rst,
                                      int bit_depth, uint8_t *dst8,
                                      int dst_stride) {
  uint16_t *const src16 = CONVERT_TO_SHORTPTR(data8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);
  int tile = 0;

  extend_frame_highbd(src16, width, height, stride);
  while (tile < rst->ntiles) {
    loop_wiener_filter_tile_highbd(src16, tile, width, height, stride, rst,
                                   bit_depth, dst16, dst_stride);
    ++tile;
  }
}

1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
// High bit-depth front end for the self-guided restoration filter.
//
// Widens the width x height block at dgd (row pitch `stride`) into the int32
// buffer dst (row pitch `dst_stride`), then passes dst to
// av1_selfguided_restoration_internal(). bit_depth, r, eps and tmpbuf are
// forwarded to that routine unchanged.
void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height,
                                         int stride, int32_t *dst,
                                         int dst_stride, int bit_depth, int r,
                                         int eps, int32_t *tmpbuf) {
  int row;
  for (row = 0; row < height; ++row) {
    const int src_off = row * stride;
    const int dst_off = row * dst_stride;
    int col;
    for (col = 0; col < width; ++col) dst[dst_off + col] = dgd[src_off + col];
  }
  av1_selfguided_restoration_internal(dst, width, height, dst_stride, bit_depth,
                                      r, eps, tmpbuf);
}

1044
1045
void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height,
                                  int stride, int32_t *dst, int dst_stride,
1046
                                  int corner, int edge) {
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
  int i, j;
  const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);

  i = 0;
  j = 0;
  {
    const int k = i * stride + j;
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
        corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
  }
  i = 0;
  j = width - 1;
  {
    const int k = i * stride + j;
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
        corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
  }
  i = height - 1;
  j = 0;
  {
    const int k = i * stride + j;
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
        corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
  }
  i = height - 1;
  j = width - 1;
  {
    const int k = i * stride + j;
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
        corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
  }
  i = 0;
  for (j = 1; j < width - 1; ++j) {
    const int k = i * stride + j;
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
             corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
                       dgd[k + 1]);
  }
  i = height - 1;
  for (j = 1; j < width - 1; ++j) {
    const int k = i * stride + j;
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
             corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
                       dgd[k + 1]);
  }
  j = 0;
  for (i = 1; i < height - 1; ++i) {
    const int k = i * stride + j;
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
             corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
                       dgd[k - stride] + dgd[k + stride]);
  }
  j = width - 1;
  for (i = 1; i < height - 1; ++i) {
    const int k = i * stride + j;
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
             corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
                       dgd[k - stride] + dgd[k + stride]);
  }
  for (i = 1; i < height - 1; ++i) {
    for (j = 1; j < width - 1; ++j) {
      const int k = i * stride + j;
      const int l = i * dst_stride + j;
      dst[l] =
          center * dgd[k] +
          edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
          corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
                    dgd[k - stride + 1] + dgd[k + stride + 1]);
    }
  }
1133
1134
}