restoration.c 63.2 KB
Newer Older
1
/*
Yaowu Xu's avatar
Yaowu Xu committed
2
3
4
5
6
7
8
9
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
11
12
13
14
 *
 */

#include <math.h>

Yaowu Xu's avatar
Yaowu Xu committed
15
16
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
17
#include "./aom_scale_rtcd.h"
18
19
#include "av1/common/onyxc_int.h"
#include "av1/common/restoration.h"
Yaowu Xu's avatar
Yaowu Xu committed
20
21
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"
22

23
#include "aom_ports/mem.h"
24

25
const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
26
27
28
29
30
31
32
#if USE_HIGHPASS_IN_SGRPROJ
  // corner, edge, r2, eps2
  { -1, 2, 1, 1 }, { -1, 2, 1, 2 }, { -1, 2, 1, 3 }, { -1, 2, 1, 4 },
  { -1, 2, 1, 5 }, { -2, 3, 1, 2 }, { -2, 3, 1, 3 }, { -2, 3, 1, 4 },
  { -2, 3, 1, 5 }, { -2, 3, 1, 6 }, { -3, 4, 1, 3 }, { -3, 4, 1, 4 },
  { -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 }
#else
33
// r1, eps1, r2, eps2
34
#if MAX_RADIUS == 2
35
36
  { 2, 12, 1, 4 },  { 2, 15, 1, 6 },  { 2, 18, 1, 8 },  { 2, 20, 1, 9 },
  { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
37
  { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 2, 30, 1, 6 },
38
39
  { 2, 50, 1, 12 }, { 2, 60, 1, 13 }, { 2, 70, 1, 14 }, { 2, 80, 1, 15 },
#else
40
41
42
43
  { 2, 12, 1, 4 },  { 2, 15, 1, 6 },  { 2, 18, 1, 8 },  { 2, 20, 1, 9 },
  { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
  { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 },
  { 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 },
44
#endif  // MAX_RADIUS == 2
45
#endif
46
47
};

48
49
50
51
typedef struct {
  RestorationInfo *rsi;
  int keyframe;
  int ntiles, nhtiles, nvtiles;
52
53
54
#if CONFIG_HIGHBITDEPTH
  int bit_depth;
#endif
55
56
57
58
59
60
61
62
63
64
65
66
67
  int subsampling_y;
  int32_t *tmpbuf;
#if CONFIG_STRIPED_LOOP_RESTORATION
  // Temporary buffers to save/restore 2 lines above/below the restoration
  // stripe
  // Allow for filter margin to left and right
  uint16_t
      tmp_save_above[2][RESTORATION_TILESIZE_MAX + 2 * RESTORATION_EXTRA_HORZ];
  uint16_t
      tmp_save_below[2][RESTORATION_TILESIZE_MAX + 2 * RESTORATION_EXTRA_HORZ];
#endif
} RestorationInternal;

68
69
int av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rst_info,
                                 int width, int height) {
70
  const int ntiles = av1_get_rest_ntiles(
71
      width, height, rst_info->restoration_tilesize, NULL, NULL);
Alex Converse's avatar
Alex Converse committed
72
  aom_free(rst_info->restoration_type);
73
  CHECK_MEM_ERROR(cm, rst_info->restoration_type,
Alex Converse's avatar
Alex Converse committed
74
                  (RestorationType *)aom_malloc(
75
                      sizeof(*rst_info->restoration_type) * ntiles));
76
77
78
79
  aom_free(rst_info->wiener_info);
  CHECK_MEM_ERROR(
      cm, rst_info->wiener_info,
      (WienerInfo *)aom_memalign(16, sizeof(*rst_info->wiener_info) * ntiles));
80
  memset(rst_info->wiener_info, 0, sizeof(*rst_info->wiener_info) * ntiles);
Alex Converse's avatar
Alex Converse committed
81
  aom_free(rst_info->sgrproj_info);
82
83
  CHECK_MEM_ERROR(
      cm, rst_info->sgrproj_info,
Alex Converse's avatar
Alex Converse committed
84
      (SgrprojInfo *)aom_malloc(sizeof(*rst_info->sgrproj_info) * ntiles));
85
86
87
88
89
90
91
92
93
94
  return ntiles;
}

void av1_free_restoration_struct(RestorationInfo *rst_info) {
  aom_free(rst_info->restoration_type);
  rst_info->restoration_type = NULL;
  aom_free(rst_info->wiener_info);
  rst_info->wiener_info = NULL;
  aom_free(rst_info->sgrproj_info);
  rst_info->sgrproj_info = NULL;
95
}
96
97
98

// TODO(debargha): This table can be substantially reduced since only a few
// values are actually used.
David Barker's avatar
David Barker committed
99
int sgrproj_mtable[MAX_EPS][MAX_NELEM];
100
101
102
103
104
105
106
107
108
109

static void GenSgrprojVtable() {
  int e, n;
  for (e = 1; e <= MAX_EPS; ++e)
    for (n = 1; n <= MAX_NELEM; ++n) {
      const int n2e = n * n * e;
      sgrproj_mtable[e - 1][n - 1] =
          (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
    }
}
110
111

void av1_loop_restoration_precal() { GenSgrprojVtable(); }
112

113
114
static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
                               int border_horz, int border_vert) {
115
116
117
118
  uint8_t *data_p;
  int i;
  for (i = 0; i < height; ++i) {
    data_p = data + i * stride;
119
120
    memset(data_p - border_horz, data_p[0], border_horz);
    memset(data_p + width, data_p[width - 1], border_horz);
121
  }
122
123
124
  data_p = data - border_horz;
  for (i = -border_vert; i < 0; ++i) {
    memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
125
  }
126
  for (i = height; i < height + border_vert; ++i) {
127
    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
128
           width + 2 * border_horz);
129
130
131
  }
}

132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#if CONFIG_HIGHBITDEPTH
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                int stride, int border_horz, int border_vert) {
  uint16_t *data_p;
  int i, j;
  for (i = 0; i < height; ++i) {
    data_p = data + i * stride;
    for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
    for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
  }
  data_p = data - border_horz;
  for (i = -border_vert; i < 0; ++i) {
    memcpy(data_p + i * stride, data_p,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
  for (i = height; i < height + border_vert; ++i) {
    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
}
#endif

void extend_frame(uint8_t *data, int width, int height, int stride,
                  int border_horz, int border_vert, int highbd) {
#if !CONFIG_HIGHBITDEPTH
  assert(highbd == 0);
  (void)highbd;
#else
  if (highbd)
    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                        border_horz, border_vert);
  else
#endif
  extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
}

static void copy_tile_lowbd(const RestorationTileLimits *limits,
                            const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride) {
  for (int i = limits->v_start; i < limits->v_end; ++i)
    memcpy(dst + i * dst_stride + limits->h_start,
           src + i * src_stride + limits->h_start,
           limits->h_end - limits->h_start);
}

#if CONFIG_HIGHBITDEPTH
static void copy_tile_highbd(const RestorationTileLimits *limits,
                             const uint16_t *src, int src_stride, uint16_t *dst,
                             int dst_stride) {
  for (int i = limits->v_start; i < limits->v_end; ++i)
    memcpy(dst + i * dst_stride + limits->h_start,
           src + i * src_stride + limits->h_start,
           (limits->h_end - limits->h_start) * sizeof(*dst));
}
#endif

static void copy_tile(const RestorationTileLimits *limits, const uint8_t *src,
                      int src_stride, uint8_t *dst, int dst_stride,
                      int highbd) {
#if !CONFIG_HIGHBITDEPTH
  assert(highbd == 0);
  (void)highbd;
#else
  if (highbd)
    copy_tile_highbd(limits, CONVERT_TO_SHORTPTR(src), src_stride,
                     CONVERT_TO_SHORTPTR(dst), dst_stride);
  else
#endif
  copy_tile_lowbd(limits, src, src_stride, dst, dst_stride);
}
202

203
204
205
206
207
208
209
210
211
212
#if CONFIG_STRIPED_LOOP_RESTORATION
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))

// Set up a processing stripe by replacing the vertical stripe
// boundary (2 lines above and 2 lines below) with data coming from
// the above/below buffers. Before doing so, the original frame data
// is saved into a temporary buffer so that it can be restored by
// calling restore_processing_stripe_boundary after filtering the
// processing stripe.
//
213
214
// Returns the height of the processing stripe
static int setup_processing_stripe_boundary(int y0, int v_end, int h_start,
215
                                            int h_end, uint8_t *data8,
216
217
218
219
220
221
                                            int stride,
                                            RestorationInternal *rst,
                                            int use_highbd) {
  int y, y_stripe_topmost, stripe_index, i;
  int tile_offset = RESTORATION_TILE_OFFSET >> rst->subsampling_y;
  int stripe_height = rst->rsi->procunit_height;
222
223
224
  uint8_t *boundary_above_buf = rst->rsi->stripe_boundary_above;
  uint8_t *boundary_below_buf = rst->rsi->stripe_boundary_below;
  int boundary_stride = rst->rsi->stripe_boundary_stride;
225
226
  int x0 = h_start - RESTORATION_EXTRA_HORZ;
  int x1 = h_end + RESTORATION_EXTRA_HORZ;
227
  const int line_size = (x1 - x0) << use_highbd;
228
229
230
231
232
233
234
235
236
237

  stripe_index = (y0 + tile_offset) / stripe_height;
  y_stripe_topmost = stripe_index * stripe_height - tile_offset;
  boundary_above_buf +=
      ((stripe_index - 1) * 2 * boundary_stride + RESTORATION_EXTRA_HORZ)
      << use_highbd;
  boundary_below_buf +=
      (stripe_index * 2 * boundary_stride + RESTORATION_EXTRA_HORZ)
      << use_highbd;

238
239
  assert(CONFIG_HIGHBITDEPTH || !use_highbd);

240
241
242
243
  // setup the 2 lines above the stripe
  for (i = 0; i < 2; i++) {
    y = y_stripe_topmost - 2 + i;
    if (y >= 0 && y < y0 && y >= y0 - 2) {
244
      uint8_t *p8 = data8 + y * stride + x0;
245
246
247
      uint8_t *new_data =
          boundary_above_buf + ((i * boundary_stride + x0) << use_highbd);
      // Save old pixels
248
      memcpy(rst->tmp_save_above[i], REAL_PTR(use_highbd, p8), line_size);
249
      // Replace width pixels from boundary_above_buf
250
      memcpy(REAL_PTR(use_highbd, p8), new_data, line_size);
251
252
253
254
255
256
    }
  }
  // setup the 2 lines below the stripe
  for (i = 0; i < 2; i++) {
    y = y_stripe_topmost + stripe_height + i;
    if (y < v_end + 2) {
257
      uint8_t *p8 = data8 + y * stride + x0;
258
259
260
      uint8_t *new_data =
          boundary_below_buf + ((i * boundary_stride + x0) << use_highbd);
      // Save old pixels
261
      memcpy(rst->tmp_save_below[i], REAL_PTR(use_highbd, p8), line_size);
262
      // Replace width pixels from boundary_below_buf
263
      memcpy(REAL_PTR(use_highbd, p8), new_data, line_size);
264
265
    }
  }
266

267
268
269
270
271
272
273
  // Return actual stripe height
  return AOMMIN(v_end, y_stripe_topmost + stripe_height) - y0;
}

// This function restores the boundary lines modified by
// setup_processing_stripe_boundary.
static void restore_processing_stripe_boundary(int y0, int v_end, int h_start,
274
                                               int h_end, uint8_t *data8,
275
276
277
278
279
280
281
282
                                               int stride,
                                               RestorationInternal *rst,
                                               int use_highbd) {
  int y, y_stripe_topmost, i, stripe_index;
  int tile_offset = 8 >> rst->subsampling_y;
  int stripe_height = rst->rsi->procunit_height;
  int x0 = h_start - RESTORATION_EXTRA_HORZ;
  int x1 = h_end + RESTORATION_EXTRA_HORZ;
283
  const int line_size = (x1 - x0) << use_highbd;
284
285
286
287

  stripe_index = (y0 + tile_offset) / stripe_height;
  y_stripe_topmost = stripe_index * stripe_height - tile_offset;

288
289
  assert(CONFIG_HIGHBITDEPTH || !use_highbd);

290
291
292
293
  // restore the 2 lines above the stripe
  for (i = 0; i < 2; i++) {
    y = y_stripe_topmost - 2 + i;
    if (y >= 0 && y < y0 && y >= y0 - 2) {
294
295
      uint8_t *p8 = data8 + y * stride + x0;
      memcpy(REAL_PTR(use_highbd, p8), rst->tmp_save_above[i], line_size);
296
297
298
299
300
301
    }
  }
  // restore the 2 lines below the stripe
  for (i = 0; i < 2; i++) {
    y = y_stripe_topmost + stripe_height + i;
    if (y < v_end + 2) {
302
303
      uint8_t *p8 = data8 + y * stride + x0;
      memcpy(REAL_PTR(use_highbd, p8), rst->tmp_save_below[i], line_size);
304
305
306
    }
  }
}
307
#undef REAL_PTR
308
309
#endif

310
311
static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert,
                                   int boundary_dist, int istop) {
312
  memcpy(vert, orig, sizeof(InterpKernel));
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
  switch (boundary_dist) {
    case 0:
      vert[WIENER_HALFWIN] += vert[2] + vert[1] + vert[0];
      vert[2] = vert[1] = vert[0] = 0;
      break;
    case 1:
      vert[2] += vert[1] + vert[0];
      vert[1] = vert[0] = 0;
      break;
    case 2:
      vert[1] += vert[0];
      vert[0] = 0;
      break;
    default: break;
  }
  if (!istop) {
    int tmp;
    tmp = vert[0];
    vert[0] = vert[WIENER_WIN - 1];
    vert[WIENER_WIN - 1] = tmp;
    tmp = vert[1];
    vert[1] = vert[WIENER_WIN - 2];
    vert[WIENER_WIN - 2] = tmp;
    tmp = vert[2];
    vert[2] = vert[WIENER_WIN - 3];
    vert[WIENER_WIN - 3] = tmp;
  }
340
341
}

342
343
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
#define wiener_convolve8_add_src aom_convolve8_add_src_hip
344
#else
345
#define wiener_convolve8_add_src aom_convolve8_add_src
346
347
#endif

348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
static void wiener_filter_stripe(const RestorationTileLimits *limits,
                                 const RestorationInternal *rst, int tile_idx,
                                 int procunit_width, int stripe_height,
                                 const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride) {
  const RestorationInfo *rsi = rst->rsi;
  const int mid_height =
      stripe_height - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2;
  assert(mid_height > 0);
  for (int j = limits->h_start; j < limits->h_end; j += procunit_width) {
    int w = AOMMIN(procunit_width, (limits->h_end - j + 15) & ~15);
    const uint8_t *src_p = src + j;
    uint8_t *dst_p = dst + j;
    for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
      InterpKernel vertical_top;
      stepdown_wiener_kernel(rsi->wiener_info[tile_idx].vfilter, vertical_top,
                             WIENER_BORDER_VERT + b, 1);
      wiener_convolve8_add_src(src_p, src_stride, dst_p, dst_stride,
                               rsi->wiener_info[tile_idx].hfilter, 16,
                               vertical_top, 16, w, 1);
      src_p += src_stride;
      dst_p += dst_stride;
370
371
    }

372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
    wiener_convolve8_add_src(src_p, src_stride, dst_p, dst_stride,
                             rsi->wiener_info[tile_idx].hfilter, 16,
                             rsi->wiener_info[tile_idx].vfilter, 16, w,
                             mid_height);
    src_p += src_stride * mid_height;
    dst_p += dst_stride * mid_height;

    for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) {
      InterpKernel vertical_bot;
      stepdown_wiener_kernel(rsi->wiener_info[tile_idx].vfilter, vertical_bot,
                             WIENER_BORDER_VERT + b, 0);
      wiener_convolve8_add_src(src_p, src_stride, dst_p, dst_stride,
                               rsi->wiener_info[tile_idx].hfilter, 16,
                               vertical_bot, 16, w, 1);
      src_p += src_stride;
      dst_p += dst_stride;
    }
389
  }
390
}
391

392
393
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
394
   specialize to r = 1, 2, 3. A default function is used for r > 3.
395
396
397
398
399
400
401
402
403
404
405
406
407
408

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
static void boxsum1(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int i, j, a, b, c;

  // Vertical sum over 3-pixel regions, from src into dst.
  if (!sqr) {
    for (j = 0; j < width; ++j) {
      a = src[j];
      b = src[src_stride + j];
      c = src[2 * src_stride + j];
409

410
411
412
413
414
415
416
417
418
419
420
421
422
423
      dst[j] = a + b;
      for (i = 1; i < height - 2; ++i) {
        // Loop invariant: At the start of each iteration,
        // a = src[(i - 1) * src_stride + j]
        // b = src[(i    ) * src_stride + j]
        // c = src[(i + 1) * src_stride + j]
        dst[i * dst_stride + j] = a + b + c;
        a = b;
        b = c;
        c = src[(i + 2) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c;
      dst[(i + 1) * dst_stride + j] = b + c;
    }
424
  } else {
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
    for (j = 0; j < width; ++j) {
      a = src[j] * src[j];
      b = src[src_stride + j] * src[src_stride + j];
      c = src[2 * src_stride + j] * src[2 * src_stride + j];

      dst[j] = a + b;
      for (i = 1; i < height - 2; ++i) {
        dst[i * dst_stride + j] = a + b + c;
        a = b;
        b = c;
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c;
      dst[(i + 1) * dst_stride + j] = b + c;
    }
  }

  // Horizontal sum over 3-pixel regions of dst
  for (i = 0; i < height; ++i) {
    a = dst[i * dst_stride];
    b = dst[i * dst_stride + 1];
    c = dst[i * dst_stride + 2];

    dst[i * dst_stride] = a + b;
    for (j = 1; j < width - 2; ++j) {
      // Loop invariant: At the start of each iteration,
      // a = src[i * src_stride + (j - 1)]
      // b = src[i * src_stride + (j    )]
      // c = src[i * src_stride + (j + 1)]
      dst[i * dst_stride + j] = a + b + c;
      a = b;
      b = c;
      c = dst[i * dst_stride + (j + 2)];
    }
    dst[i * dst_stride + j] = a + b + c;
    dst[i * dst_stride + (j + 1)] = b + c;
  }
}

static void boxsum2(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int i, j, a, b, c, d, e;

  // Vertical sum over 5-pixel regions, from src into dst.
  if (!sqr) {
    for (j = 0; j < width; ++j) {
      a = src[j];
      b = src[src_stride + j];
      c = src[2 * src_stride + j];
      d = src[3 * src_stride + j];
      e = src[4 * src_stride + j];

      dst[j] = a + b + c;
      dst[dst_stride + j] = a + b + c + d;
      for (i = 2; i < height - 3; ++i) {
        // Loop invariant: At the start of each iteration,
        // a = src[(i - 2) * src_stride + j]
        // b = src[(i - 1) * src_stride + j]
        // c = src[(i    ) * src_stride + j]
        // d = src[(i + 1) * src_stride + j]
        // e = src[(i + 2) * src_stride + j]
        dst[i * dst_stride + j] = a + b + c + d + e;
        a = b;
        b = c;
        c = d;
        d = e;
        e = src[(i + 3) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c + d + e;
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
      dst[(i + 2) * dst_stride + j] = c + d + e;
    }
  } else {
    for (j = 0; j < width; ++j) {
      a = src[j] * src[j];
      b = src[src_stride + j] * src[src_stride + j];
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
      e = src[4 * src_stride + j] * src[4 * src_stride + j];

      dst[j] = a + b + c;
      dst[dst_stride + j] = a + b + c + d;
      for (i = 2; i < height - 3; ++i) {
        dst[i * dst_stride + j] = a + b + c + d + e;
        a = b;
        b = c;
        c = d;
        d = e;
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c + d + e;
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
      dst[(i + 2) * dst_stride + j] = c + d + e;
    }
  }

  // Horizontal sum over 5-pixel regions of dst
  for (i = 0; i < height; ++i) {
    a = dst[i * dst_stride];
    b = dst[i * dst_stride + 1];
    c = dst[i * dst_stride + 2];
    d = dst[i * dst_stride + 3];
    e = dst[i * dst_stride + 4];

    dst[i * dst_stride] = a + b + c;
    dst[i * dst_stride + 1] = a + b + c + d;
    for (j = 2; j < width - 3; ++j) {
      // Loop invariant: At the start of each iteration,
      // a = src[i * src_stride + (j - 2)]
      // b = src[i * src_stride + (j - 1)]
      // c = src[i * src_stride + (j    )]
      // d = src[i * src_stride + (j + 1)]
      // e = src[i * src_stride + (j + 2)]
      dst[i * dst_stride + j] = a + b + c + d + e;
      a = b;
      b = c;
      c = d;
      d = e;
      e = dst[i * dst_stride + (j + 3)];
    }
    dst[i * dst_stride + j] = a + b + c + d + e;
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
    dst[i * dst_stride + (j + 2)] = c + d + e;
  }
}

551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
static void boxsum3(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int i, j, a, b, c, d, e, f, g;

  // Vertical sum over 7-pixel regions, from src into dst.
  if (!sqr) {
    for (j = 0; j < width; ++j) {
      a = src[j];
      b = src[1 * src_stride + j];
      c = src[2 * src_stride + j];
      d = src[3 * src_stride + j];
      e = src[4 * src_stride + j];
      f = src[5 * src_stride + j];
      g = src[6 * src_stride + j];

      dst[j] = a + b + c + d;
      dst[dst_stride + j] = a + b + c + d + e;
      dst[2 * dst_stride + j] = a + b + c + d + e + f;
      for (i = 3; i < height - 4; ++i) {
        dst[i * dst_stride + j] = a + b + c + d + e + f + g;
        a = b;
        b = c;
        c = d;
        d = e;
        e = f;
        f = g;
        g = src[(i + 4) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c + d + e + f + g;
      dst[(i + 1) * dst_stride + j] = b + c + d + e + f + g;
      dst[(i + 2) * dst_stride + j] = c + d + e + f + g;
      dst[(i + 3) * dst_stride + j] = d + e + f + g;
    }
  } else {
    for (j = 0; j < width; ++j) {
      a = src[j] * src[j];
      b = src[1 * src_stride + j] * src[1 * src_stride + j];
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
      e = src[4 * src_stride + j] * src[4 * src_stride + j];
      f = src[5 * src_stride + j] * src[5 * src_stride + j];
      g = src[6 * src_stride + j] * src[6 * src_stride + j];

      dst[j] = a + b + c + d;
      dst[dst_stride + j] = a + b + c + d + e;
      dst[2 * dst_stride + j] = a + b + c + d + e + f;
      for (i = 3; i < height - 4; ++i) {
        dst[i * dst_stride + j] = a + b + c + d + e + f + g;
        a = b;
        b = c;
        c = d;
        d = e;
        e = f;
        f = g;
        g = src[(i + 4) * src_stride + j] * src[(i + 4) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c + d + e + f + g;
      dst[(i + 1) * dst_stride + j] = b + c + d + e + f + g;
      dst[(i + 2) * dst_stride + j] = c + d + e + f + g;
      dst[(i + 3) * dst_stride + j] = d + e + f + g;
    }
  }

  // Horizontal sum over 7-pixel regions of dst
  for (i = 0; i < height; ++i) {
    a = dst[i * dst_stride];
    b = dst[i * dst_stride + 1];
    c = dst[i * dst_stride + 2];
    d = dst[i * dst_stride + 3];
    e = dst[i * dst_stride + 4];
    f = dst[i * dst_stride + 5];
    g = dst[i * dst_stride + 6];

    dst[i * dst_stride] = a + b + c + d;
    dst[i * dst_stride + 1] = a + b + c + d + e;
    dst[i * dst_stride + 2] = a + b + c + d + e + f;
    for (j = 3; j < width - 4; ++j) {
      dst[i * dst_stride + j] = a + b + c + d + e + f + g;
      a = b;
      b = c;
      c = d;
      d = e;
      e = f;
      f = g;
      g = dst[i * dst_stride + (j + 4)];
    }
    dst[i * dst_stride + j] = a + b + c + d + e + f + g;
    dst[i * dst_stride + (j + 1)] = b + c + d + e + f + g;
    dst[i * dst_stride + (j + 2)] = c + d + e + f + g;
    dst[i * dst_stride + (j + 3)] = d + e + f + g;
  }
}

// Generic version for any r. To be removed after experiments are done.
static void boxsumr(int32_t *src, int width, int height, int src_stride, int r,
                    int sqr, int32_t *dst, int dst_stride) {
  int32_t *tmp = aom_malloc(width * height * sizeof(*tmp));
  int tmp_stride = width;
  int i, j;
  if (sqr) {
    for (j = 0; j < width; ++j) tmp[j] = src[j] * src[j];
    for (j = 0; j < width; ++j)
      for (i = 1; i < height; ++i)
        tmp[i * tmp_stride + j] =
            tmp[(i - 1) * tmp_stride + j] +
            src[i * src_stride + j] * src[i * src_stride + j];
  } else {
    memcpy(tmp, src, sizeof(*tmp) * width);
    for (j = 0; j < width; ++j)
      for (i = 1; i < height; ++i)
        tmp[i * tmp_stride + j] =
            tmp[(i - 1) * tmp_stride + j] + src[i * src_stride + j];
  }
  for (i = 0; i <= r; ++i)
    memcpy(&dst[i * dst_stride], &tmp[(i + r) * tmp_stride],
           sizeof(*tmp) * width);
  for (i = r + 1; i < height - r; ++i)
    for (j = 0; j < width; ++j)
      dst[i * dst_stride + j] =
          tmp[(i + r) * tmp_stride + j] - tmp[(i - r - 1) * tmp_stride + j];
  for (i = height - r; i < height; ++i)
    for (j = 0; j < width; ++j)
      dst[i * dst_stride + j] = tmp[(height - 1) * tmp_stride + j] -
                                tmp[(i - r - 1) * tmp_stride + j];

  for (i = 0; i < height; ++i) tmp[i * tmp_stride] = dst[i * dst_stride];
  for (i = 0; i < height; ++i)
    for (j = 1; j < width; ++j)
      tmp[i * tmp_stride + j] =
          tmp[i * tmp_stride + j - 1] + dst[i * src_stride + j];

  for (j = 0; j <= r; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] = tmp[i * tmp_stride + j + r];
  for (j = r + 1; j < width - r; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] =
          tmp[i * tmp_stride + j + r] - tmp[i * tmp_stride + j - r - 1];
  for (j = width - r; j < width; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] =
          tmp[i * tmp_stride + width - 1] - tmp[i * tmp_stride + j - r - 1];
  aom_free(tmp);
}

696
697
698
699
700
701
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  if (r == 1)
    boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
  else if (r == 2)
    boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
702
703
704
705
  else if (r == 3)
    boxsum3(src, width, height, src_stride, sqr, dst, dst_stride);
  else
    boxsumr(src, width, height, src_stride, r, sqr, dst, dst_stride);
706
707
708
709
}

static void boxnum(int width, int height, int r, int8_t *num, int num_stride) {
  int i, j;
710
711
712
  for (i = 0; i <= r; ++i) {
    for (j = 0; j <= r; ++j) {
      num[i * num_stride + j] = (r + 1 + i) * (r + 1 + j);
713
714
715
716
717
718
      num[i * num_stride + (width - 1 - j)] = num[i * num_stride + j];
      num[(height - 1 - i) * num_stride + j] = num[i * num_stride + j];
      num[(height - 1 - i) * num_stride + (width - 1 - j)] =
          num[i * num_stride + j];
    }
  }
719
720
  for (j = 0; j <= r; ++j) {
    const int val = (2 * r + 1) * (r + 1 + j);
721
722
723
724
725
    for (i = r + 1; i < height - r; ++i) {
      num[i * num_stride + j] = val;
      num[i * num_stride + (width - 1 - j)] = val;
    }
  }
726
727
  for (i = 0; i <= r; ++i) {
    const int val = (2 * r + 1) * (r + 1 + i);
728
729
730
731
732
733
734
    for (j = r + 1; j < width - r; ++j) {
      num[i * num_stride + j] = val;
      num[(height - 1 - i) * num_stride + j] = val;
    }
  }
  for (i = r + 1; i < height - r; ++i) {
    for (j = r + 1; j < width - r; ++j) {
735
      num[i * num_stride + j] = (2 * r + 1) * (2 * r + 1);
736
737
738
739
740
    }
  }
}

void decode_xq(int *xqd, int *xq) {
741
  xq[0] = xqd[0];
742
743
744
  xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
}

David Barker's avatar
David Barker committed
745
const int32_t x_by_xplus1[256] = {
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
  0,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};

David Barker's avatar
David Barker committed
766
const int32_t one_by_x[MAX_NELEM] = {
767
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
768
769
770
771
772
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
#if MAX_RADIUS > 2
  158,  152,  146,  141,  137, 132, 128, 124, 120, 117, 114, 111, 108,
  105,  102,  100,  98,   95,  93,  91,  89,  87,  85,  84
#endif  // MAX_RADIUS > 2
773
774
};

775
static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
776
777
                                                int height, int dgd_stride,
                                                int32_t *dst, int dst_stride,
778
                                                int bit_depth, int r, int eps) {
779
780
781
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  const int num_stride = width_ext;
David Barker's avatar
David Barker committed
782
783
784
785
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 16 bytes, for consistency
  // with the SIMD version of this function.
786
  int buf_stride = ((width_ext + 3) & ~3) + 16;
787
788
789
790
791
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
  int32_t *A = A_;
  int32_t *B = B_;
  int8_t num_[RESTORATION_PROC_UNIT_PELS];
792
793
  int8_t *num = num_ + SGRPROJ_BORDER_VERT * num_stride + SGRPROJ_BORDER_HORZ;
  int i, j;
794

795
796
797
  // Don't filter tiles with dimensions < 5 on any axis
  if ((width < 5) || (height < 5)) return;

798
799
800
801
802
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
  boxnum(width_ext, height_ext, r, num_, num_stride);
803
  assert(r <= 3);
804
805
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
806
807
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
David Barker's avatar
David Barker committed
808
      const int k = i * buf_stride + j;
809
      const int n = num[i * num_stride + j];
810

811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
      // a < 2^16 * n < 2^22 regardless of bit depth
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
      // b < 2^8 * n < 2^14 regardless of bit depth
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);

      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
      // This is an artefact of rounding, and can only happen if all pixels
      // are (almost) identical, so in this case we saturate to p=0.
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
      uint32_t s = sgrproj_mtable[eps - 1][n - 1];

      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
      // (this holds even after accounting for the rounding in s)
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);

      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // < 2^8

      // SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^(bit_depth) * n,
      // one_by_x[n - 1] = round(2^12 / n)
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
      // and B[k] is set to a value < 2^(8 + bit depth)
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
                                             (uint32_t)B[k] *
                                             (uint32_t)one_by_x[n - 1],
                                         SGRPROJ_RECIP_BITS);
839
840
841
842
843
    }
  }
  i = 0;
  j = 0;
  {
David Barker's avatar
David Barker committed
844
    const int k = i * buf_stride + j;
845
846
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
847
    const int nb = 3;
848
    const int32_t a =
David Barker's avatar
David Barker committed
849
        3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + A[k + buf_stride + 1];
850
    const int32_t b =
David Barker's avatar
David Barker committed
851
        3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + B[k + buf_stride + 1];
852
    const int32_t v = a * dgd[l] + b;
853
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
854
855
856
857
  }
  i = 0;
  j = width - 1;
  {
David Barker's avatar
David Barker committed
858
    const int k = i * buf_stride + j;
859
860
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
861
    const int nb = 3;
862
    const int32_t a =
David Barker's avatar
David Barker committed
863
        3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + A[k + buf_stride - 1];
864
    const int32_t b =
David Barker's avatar
David Barker committed
865
        3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + B[k + buf_stride - 1];
866
    const int32_t v = a * dgd[l] + b;
867
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
868
869
870
871
  }
  i = height - 1;
  j = 0;
  {
David Barker's avatar
David Barker committed
872
    const int k = i * buf_stride + j;
873
874
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
875
    const int nb = 3;
876
    const int32_t a =
David Barker's avatar
David Barker committed
877
        3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + A[k - buf_stride + 1];
878
    const int32_t b =
David Barker's avatar
David Barker committed
879
        3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + B[k - buf_stride + 1];
880
    const int32_t v = a * dgd[l] + b;
881
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
882
883
884
885
  }
  i = height - 1;
  j = width - 1;
  {
David Barker's avatar
David Barker committed
886
    const int k = i * buf_stride + j;
887
888
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
889
    const int nb = 3;
890
    const int32_t a =
David Barker's avatar
David Barker committed
891
        3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + A[k - buf_stride - 1];
892
    const int32_t b =
David Barker's avatar
David Barker committed
893
        3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + B[k - buf_stride - 1];
894
    const int32_t v = a * dgd[l] + b;
895
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
896
897
898
  }
  i = 0;
  for (j = 1; j < width - 1; ++j) {
David Barker's avatar
David Barker committed
899
    const int k = i * buf_stride + j;
900
901
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
902
    const int nb = 3;
David Barker's avatar
David Barker committed
903
904
905
906
    const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
                      A[k + buf_stride - 1] + A[k + buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
                      B[k + buf_stride - 1] + B[k + buf_stride + 1];
907
    const int32_t v = a * dgd[l] + b;
908
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
909
910
911
  }
  i = height - 1;
  for (j = 1; j < width - 1; ++j) {
David Barker's avatar
David Barker committed
912
    const int k = i * buf_stride + j;
913
914
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
915
    const int nb = 3;
David Barker's avatar
David Barker committed
916
917
918
919
    const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
                      A[k - buf_stride - 1] + A[k - buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
                      B[k - buf_stride - 1] + B[k - buf_stride + 1];
920
    const int32_t v = a * dgd[l] + b;
921
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
922
923
924
  }
  j = 0;
  for (i = 1; i < height - 1; ++i) {
David Barker's avatar
David Barker committed
925
    const int k = i * buf_stride + j;
926
927
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
928
    const int nb = 3;
David Barker's avatar
David Barker committed
929
930
931
932
    const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
                      A[k + 1] + A[k - buf_stride + 1] + A[k + buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
                      B[k + 1] + B[k - buf_stride + 1] + B[k + buf_stride + 1];
933
    const int32_t v = a * dgd[l] + b;
934
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
935
936
937
  }
  j = width - 1;
  for (i = 1; i < height - 1; ++i) {
David Barker's avatar
David Barker committed
938
    const int k = i * buf_stride + j;
939
940
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
941
    const int nb = 3;
David Barker's avatar
David Barker committed
942
943
944
945
    const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
                      A[k - 1] + A[k - buf_stride - 1] + A[k + buf_stride - 1];
    const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
                      B[k - 1] + B[k - buf_stride - 1] + B[k + buf_stride - 1];
946
    const int32_t v = a * dgd[l] + b;
947
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
948
949
950
  }
  for (i = 1; i < height - 1; ++i) {
    for (j = 1; j < width - 1; ++j) {
David Barker's avatar
David Barker committed
951
      const int k = i * buf_stride + j;
952
953
      const int l = i * dgd_stride + j;
      const int m = i * dst_stride + j;
954
      const int nb = 5;
955
      const int32_t a =
David Barker's avatar
David Barker committed
956
957
958
959
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
              4 +
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
960
              3;
961
      const int32_t b =
David Barker's avatar
David Barker committed
962
963
964
965
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
              4 +
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
966
              3;
967
      const int32_t v = a * dgd[l] + b;
968
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
969
970
971
972
    }
  }
}

973
void av1_selfguided_restoration_c(const uint8_t *dgd, int width, int height,
974
                                  int stride, int32_t *dst, int dst_stride,
975
976
                                  int r, int eps) {
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
977
978
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
  int32_t *dgd32 =
979
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
980
  int i, j;
981
982
983
  for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
    for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
      dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
984
985
    }
  }
986
  av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
987
                                      dst_stride, 8, r, eps);
988
989
}

990
991
992
void av1_highpass_filter_c(const uint8_t *dgd, int width, int height,
                           int stride, int32_t *dst, int dst_stride, int corner,
                           int edge) {
993
  int i, j;
994
  const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
995
996
997
998
999

  i = 0;
  j = 0;
  {
    const int k = i * stride + j;
1000
1001
1002
1003
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
        corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
1004
1005
1006
1007
1008
  }
  i = 0;
  j = width - 1;
  {
    const int k = i * stride + j;
1009
1010
1011
1012
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
        corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
1013
1014
1015
1016
1017
  }
  i = height - 1;
  j = 0;
  {
    const int k = i * stride + j;
1018
1019
1020
1021
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
        corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
1022
1023
1024
1025
1026
  }
  i = height - 1;
  j = width - 1;
  {
    const int k = i * stride + j;
1027
1028
1029
1030
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
        corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
1031
1032
1033
1034
  }
  i = 0;
  for (j = 1; j < width - 1; ++j) {
    const int k = i * stride + j;
1035
1036
1037
1038
1039
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
             corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
                       dgd[k + 1]);
1040
1041
1042
1043
  }
  i = height - 1;
  for (j = 1; j < width - 1; ++j) {
    const int k = i * stride + j;
1044
1045
1046
1047
1048
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
             corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
                       dgd[k + 1]);
1049
1050
1051
1052
  }
  j = 0;
  for (i = 1; i < height - 1; ++i) {
    const int k = i * stride + j;
1053
1054
1055
1056
1057
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
             corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
                       dgd[k - stride] + dgd[k + stride]);
1058
1059
1060
1061
  }
  j = width - 1;
  for (i = 1; i < height - 1; ++i) {
    const int k = i * stride + j;
1062
1063
1064
1065
1066
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
             corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
                       dgd[k - stride] + dgd[k + stride]);
1067
1068
1069
1070
  }
  for (i = 1; i < height - 1; ++i) {
    for (j = 1; j < width - 1; ++j) {
      const int k = i * stride + j;
1071
1072
1073
1074
1075
1076
      const int l = i * dst_stride + j;
      dst[l] =
          center * dgd[k] +
          edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
          corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
                    dgd[k - stride + 1] + dgd[k + stride + 1]);
1077
1078
1079
1080
    }
  }
}

1081
void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height,
1082
1083
                                    int stride, int eps, int *xqd, uint8_t *dst,
                                    int dst_stride, int32_t *tmpbuf) {
1084
  int xq[2];
1085
  int32_t *flt1 = tmpbuf;
1086
  int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
1087
  int i, j;
1088
  assert(width * height <= RESTORATION_TILEPELS_MAX);
1089
1090
#if USE_HIGHPASS_IN_SGRPROJ
  av1_highpass_filter_c(dat, width, height, stride, flt1, width,
1091
                        sgr_params[eps].corner, sgr_params[eps].edge);
1092
#else
1093
  av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
1094
                               sgr_params[eps].r1, sgr_params[eps].e1);
1095
#endif  // USE_HIGHPASS_IN_SGRPROJ
1096
  av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
1097
                               sgr_params[eps].r2, sgr_params[eps].e2);
1098
1099
1100
1101
1102
  decode_xq(xqd, xq);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int k = i * width + j;
      const int l = i * stride + j;
1103
1104
1105
1106
      const int m = i * dst_stride + j;
      const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
      const int32_t f1 = (int32_t)flt1[k] - u;
      const int32_t f2 = (int32_t)flt2[k] - u;
David Barker's avatar
David Barker committed
1107
      const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
1108
1109
      const int16_t w =
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
1110
      dst[m] = clip_pixel(w);
1111
1112
1113
1114
    }
  }
}

1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
static void sgrproj_filter_stripe(const RestorationTileLimits *limits,
                                  const RestorationInternal *rst, int tile_idx,
                                  int procunit_width, int stripe_height,
                                  const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride) {
  for (int j = limits->h_start; j < limits->h_end; j += procunit_width) {
    int w = AOMMIN(procunit_width, limits->h_end - j);
    const uint8_t *src_p = src + j;
    uint8_t *dst_p = dst + j;
    apply_selfguided_restoration(src_p, w, stripe_height, src_stride,
                                 rst->rsi->sgrproj_info[tile_idx].ep,
                                 rst->rsi->sgrproj_info[tile_idx].xqd, dst_p,
                                 dst_stride, rst->tmpbuf);
1128
1129
1130
  }
}

1131
#if CONFIG_HIGHBITDEPTH
1132
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
1133
#define wiener_highbd_convolve8_add_src aom_highbd_convolve8_add_src_hip
1134
#else
1135
#define wiener_highbd_convolve8_add_src aom_highbd_convolve8_add_src
1136
#endif
1137

1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
static void wiener_filter_stripe_highbd(const RestorationTileLimits *limits,
                                        const RestorationInternal *rst,
                                        int tile_idx, int procunit_width,
                                        int stripe_height, const uint8_t *src8,
                                        int src_stride, uint8_t *dst8,
                                        int dst_stride) {
  const RestorationInfo *rsi = rst->rsi;
  const int mid_height =
      stripe_height - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2;
  assert(mid_height > 0);

  for (int j = limits->h_start; j < limits->h_end; j += procunit_width) {
    int w = AOMMIN(procunit_width, (limits->h_end - j + 15) & ~15);
    const uint8_t *src8_p = src8 + j;
    uint8_t *dst8_p = dst8 + j;

    for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
      InterpKernel vertical_top;
      stepdown_wiener_kernel(rsi->wiener_info[tile_idx].vfilter, vertical_top,
                             WIENER_BORDER_VERT + b, 1);
      wiener_highbd_convolve8_add_src(src8_p, src_stride, dst8_p, dst_stride,
                                      rsi->wiener_info[tile_idx].hfilter, 16,