/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 *
 */

#include <math.h>

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "./aom_scale_rtcd.h"
#include "av1/common/onyxc_int.h"
#include "av1/common/restoration.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"

#include "aom_ports/mem.h"

// Table of SGRPROJ_PARAMS parameter sets, four integers per row. The meaning
// of the four columns depends on the build configuration and is given by the
// legend comment at the top of each preprocessor branch.
const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
#if USE_HIGHPASS_IN_SGRPROJ
  // corner, edge, r2, eps2
  { -1, 2, 1, 1 }, { -1, 2, 1, 2 }, { -1, 2, 1, 3 }, { -1, 2, 1, 4 },
  { -1, 2, 1, 5 }, { -2, 3, 1, 2 }, { -2, 3, 1, 3 }, { -2, 3, 1, 4 },
  { -2, 3, 1, 5 }, { -2, 3, 1, 6 }, { -3, 4, 1, 3 }, { -3, 4, 1, 4 },
  { -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 }
#else
// r1, eps1, r2, eps2
#if MAX_RADIUS == 2
  { 2, 12, 1, 4 },  { 2, 15, 1, 6 },  { 2, 18, 1, 8 },  { 2, 20, 1, 9 },
  { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
  { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 2, 30, 1, 2 },
  { 2, 50, 1, 12 }, { 2, 60, 1, 13 }, { 2, 70, 1, 14 }, { 2, 80, 1, 15 },
#else
  { 2, 12, 1, 4 },  { 2, 15, 1, 6 },  { 2, 18, 1, 8 },  { 2, 20, 1, 9 },
  { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
  { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 },
  { 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 },
#endif  // MAX_RADIUS == 2
#endif  // USE_HIGHPASS_IN_SGRPROJ
};

clang-format's avatar
clang-format committed
48
49
// Function-pointer type for a restoration routine. Judging by the parameter
// names it takes a source buffer (data8, width x height, stride), the
// restoration state and a destination buffer (dst8, dst_stride).
typedef void (*restore_func_type)(uint8_t *data8, int width, int height,
                                  int stride, RestorationInternal *rst,
                                  uint8_t *dst8, int dst_stride);
#if CONFIG_HIGHBITDEPTH
// High-bit-depth counterpart of restore_func_type; it additionally receives
// bit_depth.
typedef void (*restore_func_highbd_type)(uint8_t *data8, int width, int height,
                                         int stride, RestorationInternal *rst,
                                         int bit_depth, uint8_t *dst8,
                                         int dst_stride);
#endif  // CONFIG_HIGHBITDEPTH
57

58
59
// (Re)allocates the per-tile arrays held by rst_info and returns the tile
// count computed by av1_get_rest_ntiles() for a width x height frame with
// rst_info->restoration_tilesize.
//
// Each array previously held by rst_info is freed before its replacement is
// allocated; the allocation result is passed through CHECK_MEM_ERROR(cm, ...).
// Only wiener_info is zero-filled; restoration_type and sgrproj_info are left
// with whatever contents the allocator returns.
int av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rst_info,
                                 int width, int height) {
  const int ntiles = av1_get_rest_ntiles(
      width, height, rst_info->restoration_tilesize, NULL, NULL, NULL, NULL);

  aom_free(rst_info->restoration_type);
  CHECK_MEM_ERROR(cm, rst_info->restoration_type,
                  (RestorationType *)aom_malloc(
                      sizeof(*rst_info->restoration_type) * ntiles));

  aom_free(rst_info->wiener_info);
  CHECK_MEM_ERROR(
      cm, rst_info->wiener_info,
      (WienerInfo *)aom_memalign(16, sizeof(*rst_info->wiener_info) * ntiles));
  memset(rst_info->wiener_info, 0, sizeof(*rst_info->wiener_info) * ntiles);

  aom_free(rst_info->sgrproj_info);
  CHECK_MEM_ERROR(
      cm, rst_info->sgrproj_info,
      (SgrprojInfo *)aom_malloc(sizeof(*rst_info->sgrproj_info) * ntiles));
  return ntiles;
}

// Releases the three per-tile arrays owned by rst_info and resets the
// pointers to NULL so that a later free or re-allocation is safe.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  aom_free(rst_info->restoration_type);
  rst_info->restoration_type = NULL;
  aom_free(rst_info->wiener_info);
  rst_info->wiener_info = NULL;
  aom_free(rst_info->sgrproj_info);
  rst_info->sgrproj_info = NULL;
}
86
87
88

// TODO(debargha): This table can be substantially reduced since only a few
// values are actually used.
David Barker's avatar
David Barker committed
89
int sgrproj_mtable[MAX_EPS][MAX_NELEM];
90
91
92
93
94
95
96
97
98
99

// Populates sgrproj_mtable. For eps in [1, MAX_EPS] and count in
// [1, MAX_NELEM], entry [eps - 1][count - 1] receives
// (1 << SGRPROJ_MTABLE_BITS) / (count * count * eps), rounded to nearest by
// adding half the divisor before the integer division.
static void GenSgrprojVtable() {
  for (int eps = 1; eps <= MAX_EPS; ++eps) {
    int *const row = sgrproj_mtable[eps - 1];
    for (int count = 1; count <= MAX_NELEM; ++count) {
      const int divisor = count * count * eps;
      const int biased = (1 << SGRPROJ_MTABLE_BITS) + divisor / 2;
      row[count - 1] = biased / divisor;
    }
  }
}
100
101

// Precalculation entry point for loop restoration. At present it only runs
// GenSgrprojVtable().
void av1_loop_restoration_precal() {
  GenSgrprojVtable();
}
102

103
// Initializes the restoration state: records the flag kf in rst->keyframe.
static void loop_restoration_init(RestorationInternal *rst, int kf) {
  rst->keyframe = kf;
}

107
108
// Pads a width x height 8-bit image in place by replicating its edge pixels.
//
//   data        - pointer to the top-left pixel of the image (not of the
//                 padded buffer).
//   stride      - distance in bytes between the starts of consecutive rows.
//   border_horz - number of pixels written to the left and right of each row.
//   border_vert - number of rows written above and below the image.
//
// The caller must own writable memory for border_horz bytes on either side of
// every row and border_vert full (padded) rows above and below the image.
void extend_frame(uint8_t *data, int width, int height, int stride,
                  int border_horz, int border_vert) {
  // Horizontal pass: repeat the first and last pixel of every image row.
  for (int i = 0; i < height; ++i) {
    uint8_t *row = data + i * stride;
    memset(row - border_horz, row[0], border_horz);
    memset(row + width, row[width - 1], border_horz);
  }

  // Vertical pass: copy the already padded first and last rows outwards, so
  // the corners are filled as well.
  uint8_t *ext = data - border_horz;
  const int ext_width = width + 2 * border_horz;
  for (int i = -border_vert; i < 0; ++i) {
    memcpy(ext + i * stride, ext, ext_width);
  }
  for (int i = height; i < height + border_vert; ++i) {
    memcpy(ext + i * stride, ext + (height - 1) * stride, ext_width);
  }
}

126
127
128
129
// Copies one restoration tile (or sub-tile) unchanged from data to dst.
// The pixel rectangle [h_start, h_end) x [v_start, v_end) is obtained from
// av1_get_rest_tile_limits() and copied row by row with memcpy.
static void loop_copy_tile(uint8_t *data, int tile_idx, int subtile_idx,
                           int subtile_bits, int width, int height, int stride,
                           RestorationInternal *rst, uint8_t *dst,
                           int dst_stride) {
  const int tile_width = rst->tile_width;
  const int tile_height = rst->tile_height;
  int i;
  int h_start, h_end, v_start, v_end;
  av1_get_rest_tile_limits(tile_idx, subtile_idx, subtile_bits, rst->nhtiles,
                           rst->nvtiles, tile_width, tile_height, width, height,
                           0, 0, &h_start, &h_end, &v_start, &v_end);
  for (i = v_start; i < v_end; ++i)
    memcpy(dst + i * dst_stride + h_start, data + i * stride + h_start,
           h_end - h_start);
}

142
143
// Builds a reduced vertical kernel in vert from orig.
//
// vert starts as a copy of orig. Depending on boundary_dist, the leading taps
// are folded into the next remaining tap and zeroed:
//   0: taps 0..2 are added into tap WIENER_HALFWIN;
//   1: taps 0..1 are added into tap 2;
//   2: tap 0 is added into tap 1;
//   any other value leaves the copy untouched.
// When istop is zero the result is then mirrored by swapping taps 0, 1, 2
// with taps WIENER_WIN - 1, WIENER_WIN - 2, WIENER_WIN - 3.
static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert,
                                   int boundary_dist, int istop) {
  memcpy(vert, orig, sizeof(InterpKernel));
  switch (boundary_dist) {
    case 0:
      vert[WIENER_HALFWIN] += vert[2] + vert[1] + vert[0];
      vert[2] = vert[1] = vert[0] = 0;
      break;
    case 1:
      vert[2] += vert[1] + vert[0];
      vert[1] = vert[0] = 0;
      break;
    case 2:
      vert[1] += vert[0];
      vert[0] = 0;
      break;
    default: break;
  }
  if (!istop) {
    // Mirror the three outer taps on each side of the kernel.
    for (int k = 0; k < 3; ++k) {
      const int tmp = vert[k];
      vert[k] = vert[WIENER_WIN - 1 - k];
      vert[WIENER_WIN - 1 - k] = tmp;
    }
  }
}

174
175
static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
                                    int height, int stride,
176
                                    RestorationInternal *rst, uint8_t *dst,
177
                                    int dst_stride) {
178
179
  const int procunit_width = rst->rsi->procunit_width;
  const int procunit_height = rst->rsi->procunit_height;
180
181
  const int tile_width = rst->tile_width;
  const int tile_height = rst->tile_height;
182
183
  int i, j;
  int h_start, h_end, v_start, v_end;
184
  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
185
186
187
188
    loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
                   dst_stride);
    return;
  }
189
  InterpKernel vertical_topbot;
190
  av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
191
                           tile_width, tile_height, width, height, 0, 0,
192
                           &h_start, &h_end, &v_start, &v_end);
193
194
  // Convolve the whole tile (done in blocks here to match the requirements
  // of the vectorized convolve functions, but the result is equivalent)
195
196
197
198
  for (i = v_start; i < v_end; i += procunit_height)
    for (j = h_start; j < h_end; j += procunit_width) {
      int w = AOMMIN(procunit_width, (h_end - j + 15) & ~15);
      int h = AOMMIN(procunit_height, (v_end - i + 15) & ~15);
199
200
      const uint8_t *data_p = data + i * stride + j;
      uint8_t *dst_p = dst + i * dst_stride + j;
201
202
203
204
      // Note h is at least 16
      for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
                               vertical_topbot, WIENER_BORDER_VERT + b, 1);
205
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
206
207
208
        aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
                                  rst->rsi->wiener_info[tile_idx].hfilter, 16,
                                  vertical_topbot, 16, w, 1);
209
#else
210
211
212
        aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
                              rst->rsi->wiener_info[tile_idx].hfilter, 16,
                              vertical_topbot, 16, w, 1);
213
#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
214
215
216
        data_p += stride;
        dst_p += dst_stride;
      }
217
218
219
220
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
      aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
                                rst->rsi->wiener_info[tile_idx].hfilter, 16,
                                rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
221
                                h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
222
223
224
225
#else
      aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
                            rst->rsi->wiener_info[tile_idx].hfilter, 16,
                            rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
226
                            h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
227
#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
228
229
230
231
232
      data_p += stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
      dst_p += dst_stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
      for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) {
        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
                               vertical_topbot, WIENER_BORDER_VERT + b, 0);
233
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
234
235
236
        aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
                                  rst->rsi->wiener_info[tile_idx].hfilter, 16,
                                  vertical_topbot, 16, w, 1);
237
#else
238
239
240
        aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
                              rst->rsi->wiener_info[tile_idx].hfilter, 16,
                              vertical_topbot, 16, w, 1);
241
#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
242
243
244
        data_p += stride;
        dst_p += dst_stride;
      }
245
246
247
    }
}

clang-format's avatar
clang-format committed
248
static void loop_wiener_filter(uint8_t *data, int width, int height, int stride,
249
250
251
                               RestorationInternal *rst, uint8_t *dst,
                               int dst_stride) {
  int tile_idx;
252
253
  extend_frame(data, width, height, stride, WIENER_BORDER_HORZ,
               WIENER_BORDER_VERT);
254
  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
255
256
    loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst,
                            dst_stride);
257
  }
258
}
259

260
261
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
   specialize to r = 1, 2, 3. A default function is used for r > 3.

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
// r = 1 specialization (3x3 window). Windows that extend past the
// width x height region are clipped to it. The vertical pass reads src and
// writes dst; the horizontal pass then runs over dst in place. Both width and
// height must be at least 3, because three samples are loaded up front in
// each pass.
static void boxsum1(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int i, j, a, b, c;

  // Vertical sum over 3-pixel regions, from src into dst.
  if (!sqr) {
    for (j = 0; j < width; ++j) {
      a = src[j];
      b = src[src_stride + j];
      c = src[2 * src_stride + j];

      dst[j] = a + b;
      for (i = 1; i < height - 2; ++i) {
        // Loop invariant: At the start of each iteration,
        // a = src[(i - 1) * src_stride + j]
        // b = src[(i    ) * src_stride + j]
        // c = src[(i + 1) * src_stride + j]
        dst[i * dst_stride + j] = a + b + c;
        a = b;
        b = c;
        c = src[(i + 2) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c;
      dst[(i + 1) * dst_stride + j] = b + c;
    }
  } else {
    for (j = 0; j < width; ++j) {
      a = src[j] * src[j];
      b = src[src_stride + j] * src[src_stride + j];
      c = src[2 * src_stride + j] * src[2 * src_stride + j];

      dst[j] = a + b;
      for (i = 1; i < height - 2; ++i) {
        // Same invariant as above, with a, b, c holding squared samples.
        dst[i * dst_stride + j] = a + b + c;
        a = b;
        b = c;
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c;
      dst[(i + 1) * dst_stride + j] = b + c;
    }
  }

  // Horizontal sum over 3-pixel regions of dst
  for (i = 0; i < height; ++i) {
    a = dst[i * dst_stride];
    b = dst[i * dst_stride + 1];
    c = dst[i * dst_stride + 2];

    dst[i * dst_stride] = a + b;
    for (j = 1; j < width - 2; ++j) {
      // Loop invariant: At the start of each iteration, a, b and c hold the
      // vertical sums that were stored (before being overwritten) at
      // a = dst[i * dst_stride + (j - 1)]
      // b = dst[i * dst_stride + (j    )]
      // c = dst[i * dst_stride + (j + 1)]
      dst[i * dst_stride + j] = a + b + c;
      a = b;
      b = c;
      c = dst[i * dst_stride + (j + 2)];
    }
    dst[i * dst_stride + j] = a + b + c;
    dst[i * dst_stride + (j + 1)] = b + c;
  }
}

// Writes 5-tap windowed sums of a strided 1-D sequence of len samples.
// Sample k is read from in[k * in_step] (squared first when sqr is non-zero)
// and result k is stored to out[k * out_step]. Windows are clipped at both
// ends of the sequence. The first five samples are loaded before anything is
// stored and every later load is ahead of the stores, so in and out may refer
// to the same sequence.
static void boxsum2_line(const int32_t *in, int in_step, int sqr, int32_t *out,
                         int out_step, int len) {
  int p0 = in[0];
  int p1 = in[in_step];
  int p2 = in[2 * in_step];
  int p3 = in[3 * in_step];
  int p4 = in[4 * in_step];
  if (sqr) {
    p0 *= p0;
    p1 *= p1;
    p2 *= p2;
    p3 *= p3;
    p4 *= p4;
  }

  // Leading edge: the window hangs over the start of the sequence.
  out[0] = p0 + p1 + p2;
  out[out_step] = p0 + p1 + p2 + p3;

  // Interior: p0..p4 hold samples k - 2 .. k + 2.
  int k = 2;
  while (k < len - 3) {
    out[k * out_step] = p0 + p1 + p2 + p3 + p4;
    p0 = p1;
    p1 = p2;
    p2 = p3;
    p3 = p4;
    p4 = in[(k + 3) * in_step];
    if (sqr) p4 *= p4;
    ++k;
  }

  // Trailing edge: the window hangs over the end of the sequence.
  out[k * out_step] = p0 + p1 + p2 + p3 + p4;
  out[(k + 1) * out_step] = p1 + p2 + p3 + p4;
  out[(k + 2) * out_step] = p2 + p3 + p4;
}

// Windowed sums (sqr == 0) or sums of squares (sqr != 0) over a 5x5 window
// (r = 2), clipped to the width x height region. Columns of src are summed
// into dst first; the rows of dst are then summed in place. Both dimensions
// must be at least 5.
static void boxsum2(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  // Vertical sum over 5-pixel regions, from src into dst.
  for (int col = 0; col < width; ++col) {
    boxsum2_line(src + col, src_stride, sqr, dst + col, dst_stride, height);
  }

  // Horizontal sum over 5-pixel regions of dst.
  for (int row = 0; row < height; ++row) {
    int32_t *line = dst + row * dst_stride;
    boxsum2_line(line, 1, 0, line, 1, width);
  }
}

419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
// Writes 7-tap windowed sums of a strided 1-D sequence of len samples.
// Sample k is read from in[k * in_step] (squared first when sqr is non-zero)
// and result k is stored to out[k * out_step]; windows are clipped at both
// ends. All loads run ahead of the stores, so in and out may refer to the
// same sequence.
static void boxsum3_line(const int32_t *in, int in_step, int sqr, int32_t *out,
                         int out_step, int len) {
  int w[7];
  for (int t = 0; t < 7; ++t) {
    const int v = in[t * in_step];
    w[t] = sqr ? v * v : v;
  }

  // Leading edge: only part of the window lies inside the sequence.
  out[0] = w[0] + w[1] + w[2] + w[3];
  out[out_step] = w[0] + w[1] + w[2] + w[3] + w[4];
  out[2 * out_step] = w[0] + w[1] + w[2] + w[3] + w[4] + w[5];

  // Interior: w[0..6] hold samples k - 3 .. k + 3.
  int k;
  for (k = 3; k < len - 4; ++k) {
    out[k * out_step] = w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + w[6];
    for (int t = 0; t < 6; ++t) w[t] = w[t + 1];
    const int v = in[(k + 4) * in_step];
    w[6] = sqr ? v * v : v;
  }

  // Trailing edge.
  out[k * out_step] = w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + w[6];
  out[(k + 1) * out_step] = w[1] + w[2] + w[3] + w[4] + w[5] + w[6];
  out[(k + 2) * out_step] = w[2] + w[3] + w[4] + w[5] + w[6];
  out[(k + 3) * out_step] = w[3] + w[4] + w[5] + w[6];
}

// Windowed sums (sqr == 0) or sums of squares (sqr != 0) over a 7x7 window
// (r = 3), clipped to the width x height region. Columns of src are summed
// into dst first; the rows of dst are then summed in place. Both dimensions
// must be at least 7.
static void boxsum3(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  // Vertical sum over 7-pixel regions, from src into dst.
  for (int col = 0; col < width; ++col) {
    boxsum3_line(src + col, src_stride, sqr, dst + col, dst_stride, height);
  }

  // Horizontal sum over 7-pixel regions of dst.
  for (int row = 0; row < height; ++row) {
    int32_t *line = dst + row * dst_stride;
    boxsum3_line(line, 1, 0, line, 1, width);
  }
}

// Generic version for any r. To be removed after experiments are done.
//
// Computes windowed sums (sqr == 0) or sums of squares (sqr != 0) over a
// (2r + 1)x(2r + 1) window clipped to the width x height region, using
// running prefix sums held in a temporary width x height buffer:
//   1. tmp receives the vertical prefix sums of src (or of its squares);
//   2. differences of those prefix sums give the vertical window sums in dst;
//   3. tmp receives the horizontal prefix sums of dst;
//   4. differences of those give the final window sums, written back to dst.
// The code reads prefix-sum rows/columns up to index 2r, so width and height
// must both exceed 2r.
// NOTE(review): the result of aom_malloc() is used without a NULL check.
static void boxsumr(int32_t *src, int width, int height, int src_stride, int r,
                    int sqr, int32_t *dst, int dst_stride) {
  int32_t *tmp = aom_malloc(width * height * sizeof(*tmp));
  int tmp_stride = width;
  int i, j;
  // Step 1: vertical prefix sums of src (or src squared) into tmp.
  if (sqr) {
    for (j = 0; j < width; ++j) tmp[j] = src[j] * src[j];
    for (j = 0; j < width; ++j)
      for (i = 1; i < height; ++i)
        tmp[i * tmp_stride + j] =
            tmp[(i - 1) * tmp_stride + j] +
            src[i * src_stride + j] * src[i * src_stride + j];
  } else {
    memcpy(tmp, src, sizeof(*tmp) * width);
    for (j = 0; j < width; ++j)
      for (i = 1; i < height; ++i)
        tmp[i * tmp_stride + j] =
            tmp[(i - 1) * tmp_stride + j] + src[i * src_stride + j];
  }
  // Step 2: vertical window sums into dst (top edge, interior, bottom edge).
  for (i = 0; i <= r; ++i)
    memcpy(&dst[i * dst_stride], &tmp[(i + r) * tmp_stride],
           sizeof(*tmp) * width);
  for (i = r + 1; i < height - r; ++i)
    for (j = 0; j < width; ++j)
      dst[i * dst_stride + j] =
          tmp[(i + r) * tmp_stride + j] - tmp[(i - r - 1) * tmp_stride + j];
  for (i = height - r; i < height; ++i)
    for (j = 0; j < width; ++j)
      dst[i * dst_stride + j] = tmp[(height - 1) * tmp_stride + j] -
                                tmp[(i - r - 1) * tmp_stride + j];

  // Step 3: horizontal prefix sums of dst into tmp. dst must be indexed with
  // dst_stride here; indexing it with src_stride reads the wrong rows
  // whenever the two strides differ.
  for (i = 0; i < height; ++i) tmp[i * tmp_stride] = dst[i * dst_stride];
  for (i = 0; i < height; ++i)
    for (j = 1; j < width; ++j)
      tmp[i * tmp_stride + j] =
          tmp[i * tmp_stride + j - 1] + dst[i * dst_stride + j];

  // Step 4: horizontal window sums back into dst (left edge, interior, right
  // edge).
  for (j = 0; j <= r; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] = tmp[i * tmp_stride + j + r];
  for (j = r + 1; j < width - r; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] =
          tmp[i * tmp_stride + j + r] - tmp[i * tmp_stride + j - r - 1];
  for (j = width - r; j < width; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] =
          tmp[i * tmp_stride + width - 1] - tmp[i * tmp_stride + j - r - 1];
  aom_free(tmp);
}

564
565
566
567
568
569
// Dispatches to the box-sum implementation for radius r: the unrolled
// versions for r = 1, 2, 3 and the generic boxsumr() for any other radius.
// All other arguments are forwarded unchanged.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 3:
      boxsum3(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default:
      boxsumr(src, width, height, src_stride, r, sqr, dst, dst_stride);
      break;
  }
}

// Fills num (width x height entries, row pitch num_stride) with the number of
// pixels of a (2r + 1)x(2r + 1) window centred on each position that fall
// inside the width x height region. The four corner blocks, the left/right
// edges, the top/bottom edges and the interior are written separately.
// Values are stored as int8_t, so (2r + 1)^2 must not exceed 127.
static void boxnum(int width, int height, int r, int8_t *num, int num_stride) {
  int i, j;
  // Corner blocks: the window is clipped both vertically and horizontally.
  // The value is computed once and mirrored into the other three corners.
  for (i = 0; i <= r; ++i) {
    for (j = 0; j <= r; ++j) {
      num[i * num_stride + j] = (r + 1 + i) * (r + 1 + j);
      num[i * num_stride + (width - 1 - j)] = num[i * num_stride + j];
      num[(height - 1 - i) * num_stride + j] = num[i * num_stride + j];
      num[(height - 1 - i) * num_stride + (width - 1 - j)] =
          num[i * num_stride + j];
    }
  }
  // Left and right edges: full window height, clipped width.
  for (j = 0; j <= r; ++j) {
    const int val = (2 * r + 1) * (r + 1 + j);
    for (i = r + 1; i < height - r; ++i) {
      num[i * num_stride + j] = val;
      num[i * num_stride + (width - 1 - j)] = val;
    }
  }
  // Top and bottom edges: full window width, clipped height.
  for (i = 0; i <= r; ++i) {
    const int val = (2 * r + 1) * (r + 1 + i);
    for (j = r + 1; j < width - r; ++j) {
      num[i * num_stride + j] = val;
      num[(height - 1 - i) * num_stride + j] = val;
    }
  }
  // Interior: the whole window lies inside the region.
  for (i = r + 1; i < height - r; ++i) {
    for (j = r + 1; j < width - r; ++j) {
      num[i * num_stride + j] = (2 * r + 1) * (2 * r + 1);
    }
  }
}

void decode_xq(int *xqd, int *xq) {
609
  xq[0] = xqd[0];
610
611
612
  xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
}

David Barker's avatar
David Barker committed
613
// 256-entry lookup table. Going by the name and the values, entry x is
// approximately 256 * x / (x + 1) (e.g. 0 -> 0, 1 -> 128, 2 -> 171); the last
// entry is stored as 256.
const int32_t x_by_xplus1[256] = {
  0,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};

David Barker's avatar
David Barker committed
634
// Lookup table of MAX_NELEM entries. Going by the name and the values, entry
// k is approximately 4096 / (k + 1) (4096, 2048, 1365, ...). The entries
// beyond the first 25 are only compiled in when MAX_RADIUS > 2.
const int32_t one_by_x[MAX_NELEM] = {
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
#if MAX_RADIUS > 2
  158,  152,  146,  141,  137, 132, 128, 124, 120, 117, 114, 111, 108,
  105,  102,  100,  98,   95,  93,  91,  89,  87,  85,  84
#endif  // MAX_RADIUS > 2
};

643
static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
644
645
                                                int height, int dgd_stride,
                                                int32_t *dst, int dst_stride,
646
                                                int bit_depth, int r, int eps) {
647
648
649
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  const int num_stride = width_ext;
David Barker's avatar
David Barker committed
650
651
652
653
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 16 bytes, for consistency
  // with the SIMD version of this function.
654
  int buf_stride = ((width_ext + 3) & ~3) + 16;
655
656
657
658
659
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
  int32_t *A = A_;
  int32_t *B = B_;
  int8_t num_[RESTORATION_PROC_UNIT_PELS];
660
661
  int8_t *num = num_ + SGRPROJ_BORDER_VERT * num_stride + SGRPROJ_BORDER_HORZ;
  int i, j;
662

663
664
665
  // Don't filter tiles with dimensions < 5 on any axis
  if ((width < 5) || (height < 5)) return;

666
667
668
669
670
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
  boxnum(width_ext, height_ext, r, num_, num_stride);
671
  assert(r <= 3);
672
673
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
674
675
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
David Barker's avatar
David Barker committed
676
      const int k = i * buf_stride + j;
677
      const int n = num[i * num_stride + j];
678

679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
      // a < 2^16 * n < 2^22 regardless of bit depth
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
      // b < 2^8 * n < 2^14 regardless of bit depth
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);

      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
      // This is an artefact of rounding, and can only happen if all pixels
      // are (almost) identical, so in this case we saturate to p=0.
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
      uint32_t s = sgrproj_mtable[eps - 1][n - 1];

      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
      // (this holds even after accounting for the rounding in s)
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);

      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // < 2^8

      // SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^(bit_depth) * n,
      // one_by_x[n - 1] = round(2^12 / n)
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
      // and B[k] is set to a value < 2^(8 + bit depth)
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
                                             (uint32_t)B[k] *
                                             (uint32_t)one_by_x[n - 1],
                                         SGRPROJ_RECIP_BITS);
707
708
709
710
711
    }
  }
  i = 0;
  j = 0;
  {
David Barker's avatar
David Barker committed
712
    const int k = i * buf_stride + j;
713
714
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
715
    const int nb = 3;
716
    const int32_t a =
David Barker's avatar
David Barker committed
717
        3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + A[k + buf_stride + 1];
718
    const int32_t b =
David Barker's avatar
David Barker committed
719
        3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + B[k + buf_stride + 1];
720
    const int32_t v = a * dgd[l] + b;
721
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
722
723
724
725
  }
  i = 0;
  j = width - 1;
  {
David Barker's avatar
David Barker committed
726
    const int k = i * buf_stride + j;
727
728
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
729
    const int nb = 3;
730
    const int32_t a =
David Barker's avatar
David Barker committed
731
        3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + A[k + buf_stride - 1];
732
    const int32_t b =
David Barker's avatar
David Barker committed
733
        3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + B[k + buf_stride - 1];
734
    const int32_t v = a * dgd[l] + b;
735
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
736
737
738
739
  }
  i = height - 1;
  j = 0;
  {
David Barker's avatar
David Barker committed
740
    const int k = i * buf_stride + j;
741
742
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
743
    const int nb = 3;
744
    const int32_t a =
David Barker's avatar
David Barker committed
745
        3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + A[k - buf_stride + 1];
746
    const int32_t b =
David Barker's avatar
David Barker committed
747
        3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + B[k - buf_stride + 1];
748
    const int32_t v = a * dgd[l] + b;
749
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
750
751
752
753
  }
  i = height - 1;
  j = width - 1;
  {
David Barker's avatar
David Barker committed
754
    const int k = i * buf_stride + j;
755
756
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
757
    const int nb = 3;
758
    const int32_t a =
David Barker's avatar
David Barker committed
759
        3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + A[k - buf_stride - 1];
760
    const int32_t b =
David Barker's avatar
David Barker committed
761
        3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + B[k - buf_stride - 1];
762
    const int32_t v = a * dgd[l] + b;
763
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
764
765
766
  }
  i = 0;
  for (j = 1; j < width - 1; ++j) {
David Barker's avatar
David Barker committed
767
    const int k = i * buf_stride + j;
768
769
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
770
    const int nb = 3;
David Barker's avatar
David Barker committed
771
772
773
774
    const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
                      A[k + buf_stride - 1] + A[k + buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
                      B[k + buf_stride - 1] + B[k + buf_stride + 1];
775
    const int32_t v = a * dgd[l] + b;
776
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
777
778
779
  }
  i = height - 1;
  for (j = 1; j < width - 1; ++j) {
David Barker's avatar
David Barker committed
780
    const int k = i * buf_stride + j;
781
782
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
783
    const int nb = 3;
David Barker's avatar
David Barker committed
784
785
786
787
    const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
                      A[k - buf_stride - 1] + A[k - buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
                      B[k - buf_stride - 1] + B[k - buf_stride + 1];
788
    const int32_t v = a * dgd[l] + b;
789
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
790
791
792
  }
  j = 0;
  for (i = 1; i < height - 1; ++i) {
David Barker's avatar
David Barker committed
793
    const int k = i * buf_stride + j;
794
795
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
796
    const int nb = 3;
David Barker's avatar
David Barker committed
797
798
799
800
    const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
                      A[k + 1] + A[k - buf_stride + 1] + A[k + buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
                      B[k + 1] + B[k - buf_stride + 1] + B[k + buf_stride + 1];
801
    const int32_t v = a * dgd[l] + b;
802
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
803
804
805
  }
  j = width - 1;
  for (i = 1; i < height - 1; ++i) {
David Barker's avatar
David Barker committed
806
    const int k = i * buf_stride + j;
807
808
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
809
    const int nb = 3;
David Barker's avatar
David Barker committed
810
811
812
813
    const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
                      A[k - 1] + A[k - buf_stride - 1] + A[k + buf_stride - 1];
    const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
                      B[k - 1] + B[k - buf_stride - 1] + B[k + buf_stride - 1];
814
    const int32_t v = a * dgd[l] + b;
815
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
816
817
818
  }
  for (i = 1; i < height - 1; ++i) {
    for (j = 1; j < width - 1; ++j) {
David Barker's avatar
David Barker committed
819
      const int k = i * buf_stride + j;
820
821
      const int l = i * dgd_stride + j;
      const int m = i * dst_stride + j;
822
      const int nb = 5;
823
      const int32_t a =
David Barker's avatar
David Barker committed
824
825
826
827
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
              4 +
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
828
              3;
829
      const int32_t b =
David Barker's avatar
David Barker committed
830
831
832
833
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
              4 +
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
834
              3;
835
      const int32_t v = a * dgd[l] + b;
836
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
837
838
839
840
    }
  }
}

841
842
void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
                                  int stride, int32_t *dst, int dst_stride,
843
844
                                  int r, int eps) {
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
845
846
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
  int32_t *dgd32 =
847
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
848
  int i, j;
849
850
851
  for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
    for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
      dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
852
853
    }
  }
854
  av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
855
                                      dst_stride, 8, r, eps);
856
857
}

858
859
void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride,
                           int32_t *dst, int dst_stride, int corner, int edge) {
860
  int i, j;
861
  const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
862
863
864
865
866

  i = 0;
  j = 0;
  {
    const int k = i * stride + j;
867
868
869
870
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
        corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
871
872
873
874
875
  }
  i = 0;
  j = width - 1;
  {
    const int k = i * stride + j;
876
877
878
879
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
        corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
880
881
882
883
884
  }
  i = height - 1;
  j = 0;
  {
    const int k = i * stride + j;
885
886
887
888
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
        corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
889
890
891
892
893
  }
  i = height - 1;
  j = width - 1;
  {
    const int k = i * stride + j;
894
895
896
897
    const int l = i * dst_stride + j;
    dst[l] =
        center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
        corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
898
899
900
901
  }
  i = 0;
  for (j = 1; j < width - 1; ++j) {
    const int k = i * stride + j;
902
903
904
905
906
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
             corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
                       dgd[k + 1]);
907
908
909
910
  }
  i = height - 1;
  for (j = 1; j < width - 1; ++j) {
    const int k = i * stride + j;
911
912
913
914
915
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
             corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
                       dgd[k + 1]);
916
917
918
919
  }
  j = 0;
  for (i = 1; i < height - 1; ++i) {
    const int k = i * stride + j;
920
921
922
923
924
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
             corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
                       dgd[k - stride] + dgd[k + stride]);
925
926
927
928
  }
  j = width - 1;
  for (i = 1; i < height - 1; ++i) {
    const int k = i * stride + j;
929
930
931
932
933
    const int l = i * dst_stride + j;
    dst[l] = center * dgd[k] +
             edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
             corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
                       dgd[k - stride] + dgd[k + stride]);
934
935
936
937
  }
  for (i = 1; i < height - 1; ++i) {
    for (j = 1; j < width - 1; ++j) {
      const int k = i * stride + j;
938
939
940
941
942
943
      const int l = i * dst_stride + j;
      dst[l] =
          center * dgd[k] +
          edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
          corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
                    dgd[k - stride + 1] + dgd[k + stride + 1]);
944
945
946
947
    }
  }
}

David Barker's avatar
David Barker committed
948
void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
949
950
                                    int stride, int eps, int *xqd, uint8_t *dst,
                                    int dst_stride, int32_t *tmpbuf) {
951
  int xq[2];
952
  int32_t *flt1 = tmpbuf;
953
  int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
954
  int i, j;
955
  assert(width * height <= RESTORATION_TILEPELS_MAX);
956
957
#if USE_HIGHPASS_IN_SGRPROJ
  av1_highpass_filter_c(dat, width, height, stride, flt1, width,
958
                        sgr_params[eps].corner, sgr_params[eps].edge);
959
#else
960
  av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
961
                               sgr_params[eps].r1, sgr_params[eps].e1);
962
#endif  // USE_HIGHPASS_IN_SGRPROJ
963
  av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
964
                               sgr_params[eps].r2, sgr_params[eps].e2);
965
966
967
968
969
  decode_xq(xqd, xq);
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int k = i * width + j;
      const int l = i * stride + j;
970
971
972
973
      const int m = i * dst_stride + j;
      const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
      const int32_t f1 = (int32_t)flt1[k] - u;
      const int32_t f2 = (int32_t)flt2[k] - u;
David Barker's avatar
David Barker committed
974
      const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
975
976
      const int16_t w =
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
977
      dst[m] = clip_pixel(w);
978
979
980
981
982
983
    }
  }
}

static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
                                     int height, int stride,
984
985
                                     RestorationInternal *rst, uint8_t *dst,
                                     int dst_stride) {
986
987
  const int procunit_width = rst->rsi->procunit_width;
  const int procunit_height = rst->rsi->procunit_height;
988
989
  const int tile_width = rst->tile_width;
  const int tile_height = rst->tile_height;
990
  int i, j;
991
992
  int h_start, h_end, v_start, v_end;

993
  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
994
995
996
997
    loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
                   dst_stride);
    return;
  }
998
999
1000
  av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
                           tile_width, tile_height, width, height, 0, 0,
                           &h_start, &h_end, &v_start, &v_end);
1001
1002
1003
1004
1005
1006
  for (i = v_start; i < v_end; i += procunit_height)
    for (j = h_start; j < h_end; j += procunit_width) {
      int w = AOMMIN(procunit_width, h_end - j);
      int h = AOMMIN(procunit_height, v_end - i);
      uint8_t *data_p = data + i * stride + j;
      uint8_t *dst_p = dst + i * dst_stride + j;
1007
      apply_selfguided_restoration(
1008
1009
1010
          data_p, w, h, stride, rst->rsi->sgrproj_info[tile_idx].ep,
          rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
    }
1011
1012
1013
1014
}

// Runs the self-guided restoration filter over a whole plane: extends the
// frame by SGRPROJ_BORDER_HORZ / SGRPROJ_BORDER_VERT with extend_frame(),
// then processes each of the rst->ntiles tiles in turn.
static void loop_sgrproj_filter(uint8_t *data, int width, int height,
                                int stride, RestorationInternal *rst,
                                uint8_t *dst, int dst_stride) {
  int tile_idx;
  extend_frame(data, width, height, stride, SGRPROJ_BORDER_HORZ,
               SGRPROJ_BORDER_VERT);
  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
    loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst,
                             dst_stride);
  }
}

1025
1026
// Runs loop restoration over a whole plane when the filter type is chosen
// per tile. The frame is extended by RESTORATION_BORDER_HORZ /
// RESTORATION_BORDER_VERT, then each tile is dispatched on its
// restoration_type: RESTORE_NONE copies the tile, RESTORE_WIENER and
// RESTORE_SGRPROJ call the matching per-tile filter. A tile with any other
// type is left untouched in dst.
static void loop_switchable_filter(uint8_t *data, int width, int height,
                                   int stride, RestorationInternal *rst,
                                   uint8_t *dst, int dst_stride) {
  int tile_idx;
  extend_frame(data, width, height, stride, RESTORATION_BORDER_HORZ,
               RESTORATION_BORDER_VERT);
  for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
    if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
      loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
                     dst_stride);
    } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_WIENER) {
      loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst,
                              dst_stride);
    } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_SGRPROJ) {
      loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst,
                               dst_stride);
    }
  }
}

1045
#if CONFIG_HIGHBITDEPTH
1046
1047
void extend_frame_highbd(uint16_t *data, int width, int height, int stride,
                         int border_horz, int border_vert) {
1048
1049
1050
1051
  uint16_t *data_p;
  int i, j;
  for (i = 0; i < height; ++i) {
    data_p = data + i * stride;
1052
1053
    for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
    for (j = width; j < width +