/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 *
 */

#include <math.h>

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "./aom_scale_rtcd.h"
#include "aom_mem/aom_mem.h"
#include "av1/common/onyxc_int.h"
#if CONFIG_HORZONLY_FRAME_SUPERRES
#include "av1/common/resize.h"
#endif
#include "av1/common/restoration.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"

#include "aom_ports/mem.h"

const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
  // r1, eps1, r2, eps2
  { 2, 12, 1, 4 },  { 2, 15, 1, 6 },  { 2, 18, 1, 8 },  { 2, 20, 1, 9 },
  { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
  { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 2, 30, 1, 6 },
  { 2, 50, 1, 12 }, { 2, 60, 1, 13 }, { 2, 70, 1, 14 }, { 2, 80, 1, 15 },
};

// Count horizontal or vertical units per tile (use a width or height for
// tile_size, respectively). We basically want to divide the tile size by the
// size of a restoration unit. Rather than rounding up unconditionally as you
// might expect, we round to nearest, which models the way a right or bottom
// restoration unit can extend to up to 150% its normal width or height. The
// max with 1 is to deal with tiles that are smaller than half of a restoration
// unit.
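// For example (hypothetical numbers): with unit_size = 256 and tile_size = 780,
// this returns (780 + 128) / 256 = 3, so the last unit absorbs the trailing
// 268 pixels rather than a fourth, sliver-sized unit being created.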
static int count_units_in_tile(int unit_size, int tile_size) {
  return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1);
}

void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
#if CONFIG_MAX_TILE
  // We need to allocate enough space for restoration units to cover the
  // largest tile. Without CONFIG_MAX_TILE, this is always the tile at the
  // top-left and we can use av1_get_tile_rect(). With CONFIG_MAX_TILE, we have
  // to do the computation ourselves, iterating over the tiles and keeping
  // track of the largest width and height, then upscaling.
  TileInfo tile;
  int max_mi_w = 0;
  int max_mi_h = 0;
  int tile_col = 0;
  int tile_row = 0;
  for (int i = 0; i < cm->tile_cols; ++i) {
    av1_tile_set_col(&tile, cm, i);
    if (tile.mi_col_end - tile.mi_col_start > max_mi_w) {
      max_mi_w = tile.mi_col_end - tile.mi_col_start;
      tile_col = i;
    }
  }
  for (int i = 0; i < cm->tile_rows; ++i) {
    av1_tile_set_row(&tile, cm, i);
    if (tile.mi_row_end - tile.mi_row_start > max_mi_h) {
      max_mi_h = tile.mi_row_end - tile.mi_row_start;
      tile_row = i;
    }
  }
  TileInfo tile_info;
  av1_tile_init(&tile_info, cm, tile_row, tile_col);
#else
  TileInfo tile_info;
  av1_tile_init(&tile_info, cm, 0, 0);
#endif  // CONFIG_MAX_TILE

  const AV1PixelRect tile_rect = av1_get_tile_rect(&tile_info, cm, is_uv);
  const int max_tile_w = tile_rect.right - tile_rect.left;
  const int max_tile_h = tile_rect.bottom - tile_rect.top;

  // To calculate hpertile and vpertile (horizontal and vertical units per
  // tile), we basically want to divide the largest tile width or height by the
  // size of a restoration unit. Rather than rounding up unconditionally as you
  // might expect, we round to nearest, which models the way a right or bottom
  // restoration unit can extend to up to 150% its normal width or height. The
  // max with 1 is to deal with tiles that are smaller than half of a
  // restoration unit.
  const int unit_size = rsi->restoration_unit_size;
  const int hpertile = count_units_in_tile(unit_size, max_tile_w);
  const int vpertile = count_units_in_tile(unit_size, max_tile_h);

  rsi->units_per_tile = hpertile * vpertile;
  rsi->horz_units_per_tile = hpertile;
  rsi->vert_units_per_tile = vpertile;

  const int ntiles = cm->tile_rows * cm->tile_cols;
  const int nunits = ntiles * rsi->units_per_tile;

  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * nunits));
}

void av1_free_restoration_struct(RestorationInfo *rst_info) {
  aom_free(rst_info->unit_info);
  rst_info->unit_info = NULL;
}

// TODO(debargha): This table can be substantially reduced since only a few
// values are actually used.
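// sgrproj_mtable[e - 1][n - 1] holds round(2^SGRPROJ_MTABLE_BITS / (n * n * e));
// it is filled in at runtime by GenSgrprojVtable() below.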
int sgrproj_mtable[MAX_EPS][MAX_NELEM];

static void GenSgrprojVtable() {
  int e, n;
  for (e = 1; e <= MAX_EPS; ++e)
    for (n = 1; n <= MAX_NELEM; ++n) {
      const int n2e = n * n * e;
      sgrproj_mtable[e - 1][n - 1] =
          (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
    }
}

void av1_loop_restoration_precal() { GenSgrprojVtable(); }

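// Extend the plane by replicating edge pixels: each row is padded by
// border_horz columns on the left/right, then border_vert rows are replicated
// above and below.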
static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
                               int border_horz, int border_vert) {
  uint8_t *data_p;
  int i;
  for (i = 0; i < height; ++i) {
    data_p = data + i * stride;
    memset(data_p - border_horz, data_p[0], border_horz);
    memset(data_p + width, data_p[width - 1], border_horz);
  }
  data_p = data - border_horz;
  for (i = -border_vert; i < 0; ++i) {
    memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
  }
  for (i = height; i < height + border_vert; ++i) {
    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
           width + 2 * border_horz);
  }
}

static void extend_frame_highbd(uint16_t *data, int width, int height,
                                int stride, int border_horz, int border_vert) {
  uint16_t *data_p;
  int i, j;
  for (i = 0; i < height; ++i) {
    data_p = data + i * stride;
    for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
    for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
  }
  data_p = data - border_horz;
  for (i = -border_vert; i < 0; ++i) {
    memcpy(data_p + i * stride, data_p,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
  for (i = height; i < height + border_vert; ++i) {
    memcpy(data_p + i * stride, data_p + (height - 1) * stride,
           (width + 2 * border_horz) * sizeof(uint16_t));
  }
}

void extend_frame(uint8_t *data, int width, int height, int stride,
                  int border_horz, int border_vert, int highbd) {
  if (highbd)
    extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                        border_horz, border_vert);
  else
    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
}

static void copy_tile_lowbd(int width, int height, const uint8_t *src,
                            int src_stride, uint8_t *dst, int dst_stride) {
  for (int i = 0; i < height; ++i)
    memcpy(dst + i * dst_stride, src + i * src_stride, width);
}

static void copy_tile_highbd(int width, int height, const uint16_t *src,
                             int src_stride, uint16_t *dst, int dst_stride) {
  for (int i = 0; i < height; ++i)
    memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
}

static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride, int highbd) {
  if (highbd)
    copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                     CONVERT_TO_SHORTPTR(dst), dst_stride);
  else
    copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
}

#if CONFIG_STRIPED_LOOP_RESTORATION
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))

// Helper function: Save one column of left/right context to the appropriate
// column buffers, then extend the edge of the current tile into that column.
//
// Note: The height passed in should be the height of this processing unit,
// but we actually save/restore an extra RESTORATION_BORDER pixels above and
// below the stripe.
#if CONFIG_LOOPFILTERING_ACROSS_TILES || CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
static void setup_boundary_column(const uint8_t *src8, int src_stride,
                                  uint8_t *dst8, int dst_stride, uint16_t *buf,
                                  int h, int use_highbd) {
  if (use_highbd) {
    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src8);
    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
    for (int i = -RESTORATION_BORDER; i < h + RESTORATION_BORDER; i++) {
      buf[i + RESTORATION_BORDER] = dst16[i * dst_stride];
      dst16[i * dst_stride] = src16[i * src_stride];
    }
  } else {
    for (int i = -RESTORATION_BORDER; i < h + RESTORATION_BORDER; i++) {
      buf[i + RESTORATION_BORDER] = dst8[i * dst_stride];
      dst8[i * dst_stride] = src8[i * src_stride];
    }
  }
}

static void restore_boundary_column(uint8_t *dst8, int dst_stride,
                                    const uint16_t *buf, int h,
                                    int use_highbd) {
  if (use_highbd) {
    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
    for (int i = -RESTORATION_BORDER; i < h + RESTORATION_BORDER; i++)
      dst16[i * dst_stride] = buf[i + RESTORATION_BORDER];
  } else {
    for (int i = -RESTORATION_BORDER; i < h + RESTORATION_BORDER; i++)
      dst8[i * dst_stride] = (uint8_t)(buf[i + RESTORATION_BORDER]);
  }
}
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES || CONFIG_LOOPFILTERING_ACROSS_TILES_EXT

// With striped loop restoration, the filtering for each 64-pixel stripe gets
// most of its input from the output of CDEF (stored in data8), but we need to
// fill out a border of 3 pixels above/below the stripe according to the
// following rules:
//
// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
//   This extension is done by a call to extend_frame() at the start of the loop
//   restoration process, so the value of copy_above/copy_below doesn't strictly
//   matter.
//   However, by setting *copy_above = *copy_below = 1 whenever loop filtering
//   across tiles is disabled, we can allow
//   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
//   data has always been copied, simplifying the behaviour at the left and
//   right edges of tiles.
//
// * If we're at a tile boundary and loop filtering across tiles is enabled,
//   then there is a logical stripe which is 64 pixels high, but which is split
//   into an 8px high and a 56px high stripe so that the processing (and
//   coefficient set usage) can be aligned to tiles.
//   In this case, we use the 3 rows of CDEF output across the boundary for
//   context; this corresponds to leaving the frame buffer as-is.
//
// * If we're at a tile boundary and loop filtering across tiles is disabled,
//   then we take the outermost row of CDEF pixels *within the current tile*
//   and copy it three times. Thus we behave exactly as if the tile were a full
//   frame.
//
// * Otherwise, we're at a stripe boundary within a tile. In that case, we
//   take 2 rows of deblocked pixels and extend them to 3 rows of context.
//
// The distinction between the latter two cases is handled by the
// av1_loop_restoration_save_boundary_lines() function, so here we just need
// to decide if we're overwriting the above/below boundary pixels or not.
static void get_stripe_boundary_info(const RestorationTileLimits *limits,
                                     const AV1PixelRect *tile_rect, int ss_y,
#if CONFIG_LOOPFILTERING_ACROSS_TILES
#if CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
                                     int loop_filter_across_tiles_h_enabled,
#else
                                     int loop_filter_across_tiles_enabled,
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
                                     int *copy_above, int *copy_below) {
  *copy_above = 1;
  *copy_below = 1;

#if CONFIG_LOOPFILTERING_ACROSS_TILES
#if CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
  if (loop_filter_across_tiles_h_enabled) {
#else
  if (loop_filter_across_tiles_enabled) {
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
    const int rtile_offset = RESTORATION_TILE_OFFSET >> ss_y;

    const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
    const int this_stripe_height =
        full_stripe_height - (first_stripe_in_tile ? rtile_offset : 0);
    const int last_stripe_in_tile =
        (limits->v_start + this_stripe_height >= tile_rect->bottom);

    if (first_stripe_in_tile) *copy_above = 0;
    if (last_stripe_in_tile) *copy_below = 0;
#if CONFIG_LOOPFILTERING_ACROSS_TILES || CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
  }
#endif
}

// Overwrite the border pixels around a processing stripe so that the conditions
// listed above get_stripe_boundary_info() are preserved.
// We save the pixels which get overwritten into a temporary buffer, so that
// they can be restored by restore_processing_stripe_boundary() after we've
// processed the stripe.
//
// limits gives the rectangular limits of the remaining stripes for the current
// restoration unit. rsb is the stored stripe boundaries (taken from either
// deblock or CDEF output as necessary).
//
// tile_rect is the limits of the current tile and tile_stripe0 is the index of
// the first stripe in this tile (needed to convert the tile-relative stripe
// index we get from limits into something we can look up in rsb).
static void setup_processing_stripe_boundary(
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
    int rsb_row, int use_highbd, int h,
#if CONFIG_LOOPFILTERING_ACROSS_TILES
#if CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
    const AV1PixelRect *tile_rect, int loop_filter_across_tiles_v_enabled,
#else
    const AV1PixelRect *tile_rect, int loop_filter_across_tiles_enabled,
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
    uint8_t *data8, int data_stride, RestorationLineBuffers *rlbs,
    int copy_above, int copy_below) {
  // Offsets within the line buffers. The buffer logically starts at column
  // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
  // has column x0 in the buffer.
  const int buf_stride = rsb->stripe_boundary_stride;
  const int buf_x0_off = limits->h_start;
  const int line_width =
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
  const int line_size = line_width << use_highbd;

  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;

  // Replace RESTORATION_BORDER pixels above the top of the stripe
  // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
  // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
  // duplicating the topmost of the 2 lines (see the AOMMAX term used when
  // calculating buf_row, which takes the values 0, 0, 1 for i = -3, -2, -1).
  //
  // Special case: If we're at the top of a tile, which isn't on the topmost
  // tile row, and we're allowed to loop filter across tiles, then we have a
  // logical 64-pixel-high stripe which has been split into an 8-pixel high
  // stripe and a 56-pixel high stripe (the current one). So, in this case,
  // we want to leave the boundary alone!
  if (copy_above) {
    uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;

    for (int i = -RESTORATION_BORDER; i < 0; ++i) {
      const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
      const int buf_off = buf_x0_off + buf_row * buf_stride;
      const uint8_t *buf = rsb->stripe_boundary_above + (buf_off << use_highbd);
      uint8_t *dst8 = data8_tl + i * data_stride;
      // Save old pixels, then replace with data from stripe_boundary_above
      memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
             REAL_PTR(use_highbd, dst8), line_size);
      memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
    }
  }

  // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
  // The second buffer row is repeated, so the AOMMIN term used when
  // calculating buf_row takes the values 0, 1, 1 for i = 0, 1, 2.
  if (copy_below) {
    const int stripe_end = limits->v_start + h;
    uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;

    for (int i = 0; i < RESTORATION_BORDER; ++i) {
      const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
      const int buf_off = buf_x0_off + buf_row * buf_stride;
      const uint8_t *src = rsb->stripe_boundary_below + (buf_off << use_highbd);

      uint8_t *dst8 = data8_bl + i * data_stride;
      // Save old pixels, then replace with data from stripe_boundary_below
      memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
      memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
    }
  }

#if CONFIG_LOOPFILTERING_ACROSS_TILES
#if CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
  if (!loop_filter_across_tiles_v_enabled) {
    // If loopfiltering across tiles is disabled, we need to check if we're at
    // the edge of the current tile column. If we are, we need to extend the
    // leftmost/rightmost column within the tile by 3 pixels, so that the output
    // doesn't depend on pixels from the next column over.
    // This applies to the top and bottom borders too, since those may have
    // been filled out with data from the tile to the top-left (etc.) of us.
    const int at_tile_left_border = (limits->h_start == tile_rect->left);
    const int at_tile_right_border = (limits->h_end == tile_rect->right);

    if (at_tile_left_border) {
      uint8_t *dst8 = data8 + limits->h_start + limits->v_start * data_stride;
      for (int j = -RESTORATION_BORDER; j < 0; j++)
        setup_boundary_column(dst8, data_stride, dst8 + j, data_stride,
409 410
                              rlbs->tmp_save_left[j + RESTORATION_BORDER], h,
                              use_highbd);
411 412 413 414 415 416
    }

    if (at_tile_right_border) {
      uint8_t *dst8 = data8 + limits->h_end + limits->v_start * data_stride;
      for (int j = 0; j < RESTORATION_BORDER; j++)
        setup_boundary_column(dst8 - 1, data_stride, dst8 + j, data_stride,
417
                              rlbs->tmp_save_right[j], h, use_highbd);
418 419
    }
  }
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443
#else
  if (!loop_filter_across_tiles_enabled) {
    // If loopfiltering across tiles is disabled, we need to extend tile edges
    // by 3 pixels, to ensure that we don't sample from the tiles to our left
    // or right.
    const int at_tile_left_border = (limits->h_start == tile_rect->left);
    const int at_tile_right_border = (limits->h_end == tile_rect->right);

    if (at_tile_left_border) {
      uint8_t *dst8 = data8 + limits->h_start + limits->v_start * data_stride;
      for (int j = -RESTORATION_BORDER; j < 0; j++)
        setup_boundary_column(dst8, data_stride, dst8 + j, data_stride,
                              rlbs->tmp_save_left[j + RESTORATION_BORDER], h,
                              use_highbd);
    }

    if (at_tile_right_border) {
      uint8_t *dst8 = data8 + limits->h_end + limits->v_start * data_stride;
      for (int j = 0; j < RESTORATION_BORDER; j++)
        setup_boundary_column(dst8 - 1, data_stride, dst8 + j, data_stride,
                              rlbs->tmp_save_right[j], h, use_highbd);
    }
  }
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
}

// This function restores the boundary lines modified by
// setup_processing_stripe_boundary.
//
// Note: We need to be careful when handling the corners of the processing
// unit, because (eg.) the top-left corner is considered to be part of
// both the left and top borders. This means that, depending on the
// loop_filter_across_tiles_enabled flag, the corner pixels might get
// overwritten twice, once as part of the "top" border and once as part
// of the "left" border (or similar for other corners).
//
// Everything works out fine as long as we make sure to reverse the order
// when restoring, ie. we need to restore the left/right borders followed
// by the top/bottom borders.
static void restore_processing_stripe_boundary(
    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
    int use_highbd, int h,
#if CONFIG_LOOPFILTERING_ACROSS_TILES
#if CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
    const AV1PixelRect *tile_rect, int loop_filter_across_tiles_v_enabled,
#else
    const AV1PixelRect *tile_rect, int loop_filter_across_tiles_enabled,
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
    uint8_t *data8, int data_stride, int copy_above, int copy_below) {
  const int line_width =
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
  const int line_size = line_width << use_highbd;

  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;

#if CONFIG_LOOPFILTERING_ACROSS_TILES
#if CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
  if (!loop_filter_across_tiles_v_enabled) {
    // Restore any pixels we overwrote at the left/right edge of this
    // processing unit.
    const int at_tile_left_border = (limits->h_start == tile_rect->left);
    const int at_tile_right_border = (limits->h_end == tile_rect->right);

    if (at_tile_left_border) {
      uint8_t *dst8 = data8 + limits->h_start + limits->v_start * data_stride;
      for (int j = -RESTORATION_BORDER; j < 0; j++)
        restore_boundary_column(dst8 + j, data_stride,
                                rlbs->tmp_save_left[j + RESTORATION_BORDER], h,
                                use_highbd);
    }

    if (at_tile_right_border) {
      uint8_t *dst8 = data8 + limits->h_end + limits->v_start * data_stride;
      for (int j = 0; j < RESTORATION_BORDER; j++)
        restore_boundary_column(dst8 + j, data_stride, rlbs->tmp_save_right[j],
                                h, use_highbd);
    }
  }
#else
  if (!loop_filter_across_tiles_enabled) {
    // Restore any pixels we overwrote at the left/right edge of this
    // processing unit.
    const int at_tile_left_border = (limits->h_start == tile_rect->left);
    const int at_tile_right_border = (limits->h_end == tile_rect->right);

    if (at_tile_left_border) {
      uint8_t *dst8 = data8 + limits->h_start + limits->v_start * data_stride;
      for (int j = -RESTORATION_BORDER; j < 0; j++)
        restore_boundary_column(dst8 + j, data_stride,
                                rlbs->tmp_save_left[j + RESTORATION_BORDER], h,
                                use_highbd);
    }

    if (at_tile_right_border) {
      uint8_t *dst8 = data8 + limits->h_end + limits->v_start * data_stride;
      for (int j = 0; j < RESTORATION_BORDER; j++)
        restore_boundary_column(dst8 + j, data_stride, rlbs->tmp_save_right[j],
                                h, use_highbd);
    }
  }
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES_EXT
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES

  if (copy_above) {
    uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
    for (int i = -RESTORATION_BORDER; i < 0; ++i) {
      uint8_t *dst8 = data8_tl + i * data_stride;
      memcpy(REAL_PTR(use_highbd, dst8),
             rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
    }
  }

  if (copy_below) {
    const int stripe_bottom = limits->v_start + h;
    uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;

    for (int i = 0; i < RESTORATION_BORDER; ++i) {
      if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;

      uint8_t *dst8 = data8_bl + i * data_stride;
      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
    }
  }
}
#endif

#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
#define wiener_convolve8_add_src aom_convolve8_add_src_hip
#else
#define wiener_convolve8_add_src aom_convolve8_add_src
#endif

static void wiener_filter_stripe(const RestorationUnitInfo *rui,
                                 int stripe_width, int stripe_height,
                                 int procunit_width, const uint8_t *src,
                                 int src_stride, uint8_t *dst, int dst_stride,
                                 int32_t *tmpbuf, int bit_depth) {
  (void)tmpbuf;
  (void)bit_depth;
  assert(bit_depth == 8);

  for (int j = 0; j < stripe_width; j += procunit_width) {
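    // Process the stripe in procunit_width-wide columns; the width of the
    // final, possibly narrower column is rounded up to a multiple of 16.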
    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
    const uint8_t *src_p = src + j;
    uint8_t *dst_p = dst + j;
    wiener_convolve8_add_src(src_p, src_stride, dst_p, dst_stride,
                             rui->wiener_info.hfilter, 16,
                             rui->wiener_info.vfilter, 16, w, stripe_height);
  }
}

/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
   specialize to r = 1 and r = 2; boxsum() asserts for any other value of r.

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
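// For instance, with r = 1 and sqr = 0, dst[i * dst_stride + j] ends up
// holding the sum of src over the 3x3 window centred on (i, j), with the
// window truncated at the plane edges (a corner output sums only 4 values).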
static void boxsum1(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int i, j, a, b, c;

  // Vertical sum over 3-pixel regions, from src into dst.
  if (!sqr) {
    for (j = 0; j < width; ++j) {
      a = src[j];
      b = src[src_stride + j];
      c = src[2 * src_stride + j];

      dst[j] = a + b;
      for (i = 1; i < height - 2; ++i) {
        // Loop invariant: At the start of each iteration,
        // a = src[(i - 1) * src_stride + j]
        // b = src[(i    ) * src_stride + j]
        // c = src[(i + 1) * src_stride + j]
        dst[i * dst_stride + j] = a + b + c;
        a = b;
        b = c;
        c = src[(i + 2) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c;
      dst[(i + 1) * dst_stride + j] = b + c;
    }
  } else {
    for (j = 0; j < width; ++j) {
      a = src[j] * src[j];
      b = src[src_stride + j] * src[src_stride + j];
      c = src[2 * src_stride + j] * src[2 * src_stride + j];

      dst[j] = a + b;
      for (i = 1; i < height - 2; ++i) {
        dst[i * dst_stride + j] = a + b + c;
        a = b;
        b = c;
        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c;
      dst[(i + 1) * dst_stride + j] = b + c;
    }
  }

  // Horizontal sum over 3-pixel regions of dst
  for (i = 0; i < height; ++i) {
    a = dst[i * dst_stride];
    b = dst[i * dst_stride + 1];
    c = dst[i * dst_stride + 2];

    dst[i * dst_stride] = a + b;
    for (j = 1; j < width - 2; ++j) {
      // Loop invariant: At the start of each iteration,
      // a = dst[i * dst_stride + (j - 1)]
      // b = dst[i * dst_stride + (j    )]
      // c = dst[i * dst_stride + (j + 1)]
      dst[i * dst_stride + j] = a + b + c;
      a = b;
      b = c;
      c = dst[i * dst_stride + (j + 2)];
    }
    dst[i * dst_stride + j] = a + b + c;
    dst[i * dst_stride + (j + 1)] = b + c;
  }
}

static void boxsum2(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int i, j, a, b, c, d, e;

  // Vertical sum over 5-pixel regions, from src into dst.
  if (!sqr) {
    for (j = 0; j < width; ++j) {
      a = src[j];
      b = src[src_stride + j];
      c = src[2 * src_stride + j];
      d = src[3 * src_stride + j];
      e = src[4 * src_stride + j];

      dst[j] = a + b + c;
      dst[dst_stride + j] = a + b + c + d;
      for (i = 2; i < height - 3; ++i) {
        // Loop invariant: At the start of each iteration,
        // a = src[(i - 2) * src_stride + j]
        // b = src[(i - 1) * src_stride + j]
        // c = src[(i    ) * src_stride + j]
        // d = src[(i + 1) * src_stride + j]
        // e = src[(i + 2) * src_stride + j]
        dst[i * dst_stride + j] = a + b + c + d + e;
        a = b;
        b = c;
        c = d;
        d = e;
        e = src[(i + 3) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c + d + e;
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
      dst[(i + 2) * dst_stride + j] = c + d + e;
    }
  } else {
    for (j = 0; j < width; ++j) {
      a = src[j] * src[j];
      b = src[src_stride + j] * src[src_stride + j];
      c = src[2 * src_stride + j] * src[2 * src_stride + j];
      d = src[3 * src_stride + j] * src[3 * src_stride + j];
      e = src[4 * src_stride + j] * src[4 * src_stride + j];

      dst[j] = a + b + c;
      dst[dst_stride + j] = a + b + c + d;
      for (i = 2; i < height - 3; ++i) {
        dst[i * dst_stride + j] = a + b + c + d + e;
        a = b;
        b = c;
        c = d;
        d = e;
        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
      }
      dst[i * dst_stride + j] = a + b + c + d + e;
      dst[(i + 1) * dst_stride + j] = b + c + d + e;
      dst[(i + 2) * dst_stride + j] = c + d + e;
    }
  }

  // Horizontal sum over 5-pixel regions of dst
  for (i = 0; i < height; ++i) {
    a = dst[i * dst_stride];
    b = dst[i * dst_stride + 1];
    c = dst[i * dst_stride + 2];
    d = dst[i * dst_stride + 3];
    e = dst[i * dst_stride + 4];

    dst[i * dst_stride] = a + b + c;
    dst[i * dst_stride + 1] = a + b + c + d;
    for (j = 2; j < width - 3; ++j) {
      // Loop invariant: At the start of each iteration,
      // a = dst[i * dst_stride + (j - 2)]
      // b = dst[i * dst_stride + (j - 1)]
      // c = dst[i * dst_stride + (j    )]
      // d = dst[i * dst_stride + (j + 1)]
      // e = dst[i * dst_stride + (j + 2)]
      dst[i * dst_stride + j] = a + b + c + d + e;
      a = b;
      b = c;
      c = d;
      d = e;
      e = dst[i * dst_stride + (j + 3)];
    }
    dst[i * dst_stride + j] = a + b + c + d + e;
    dst[i * dst_stride + (j + 1)] = b + c + d + e;
    dst[i * dst_stride + (j + 2)] = c + d + e;
  }
}

static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  if (r == 1)
    boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
  else if (r == 2)
    boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
  else
    assert(0 && "Invalid value of r in self-guided filter");
}

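// Worked example (hypothetical values, assuming SGRPROJ_PRJ_BITS == 7):
// xqd = { -32, 31 } decodes to xq[0] = -32 and
// xq[1] = 128 - (-32) - 31 = 129.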
void decode_xq(const int *xqd, int *xq) {
  xq[0] = xqd[0];
  xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
}

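// Each entry approximates 256 * x / (x + 1) for x = 0, ..., 255. The final
// entry is 256, the limiting value for large x, and index 255 absorbs every
// z >= 255 via the AOMMIN(z, 255) clamp used below.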
const int32_t x_by_xplus1[256] = {
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
  // instead of 0. See comments in av1_selfguided_restoration_internal() for why
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};

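// one_by_x[n - 1] = round(4096 / n), i.e. round(2^12 / n): a fixed-point
// reciprocal of the window size n, used in the B[k] calculation below.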
const int32_t one_by_x[MAX_NELEM] = {
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
};

#if CONFIG_FAST_SGR
static void av1_selfguided_restoration_fast_internal(
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
    int dst_stride, int bit_depth, int r, int eps) {
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 16 bytes, for consistency
  // with the SIMD version of this function.
  int buf_stride = ((width_ext + 3) & ~3) + 16;
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
  int32_t *A = A_;
  int32_t *B = B_;
  int i, j;

  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
         "Need SGRPROJ_BORDER_* >= r+1");

  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
  for (i = -1; i < height + 1; i += 2) {
    for (j = -1; j < width + 1; ++j) {
      const int k = i * buf_stride + j;
      const int n = (2 * r + 1) * (2 * r + 1);

      // a < 2^16 * n < 2^22 regardless of bit depth
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
      // b < 2^8 * n < 2^14 regardless of bit depth
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);

      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
      // This bound on p is due to:
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
      //
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
      // This is an artefact of rounding, and can only happen if all pixels
      // are (almost) identical, so in this case we saturate to p=0.
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;

      // Note: If MAX_RADIUS <= 2, then this 's' is a function only of
      // r and eps. Further, this is the only place we use 'eps', so we could
      // pre-calculate 's' for each parameter set and store that in place of
      // 'eps'.
      uint32_t s = sgrproj_mtable[eps - 1][n - 1];

      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
      // (this holds even after accounting for the rounding in s)
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);

      // Note: We have to be quite careful about the value of A[k].
      // This is used as a blend factor between individual pixel values and the
      // local mean. So it logically has a range of [0, 256], including both
      // endpoints.
      //
      // This is a pain for hardware, as we'd like something which can be stored
      // in exactly 8 bits.
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
      // slightly above 2^(8 + bit depth), due to rounding in the value of
      // one_by_x[25-1].
      //
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
      // overflow), without significantly affecting the final result: z == 0
      // implies that the image is essentially "flat", so the local mean and
      // individual pixel values are very similar.
      //
      // Note that saturating on the other side, i.e. requiring A[k] <= 255,
      // would be a bad idea, as that corresponds to the case where the image
      // is very variable, when we want to preserve the local pixel value as
      // much as possible.
      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]

      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
      // one_by_x[n - 1] = round(2^12 / n)
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
      // and B[k] is set to a value < 2^(8 + bit depth)
      // This holds even with the rounding in one_by_x and in the overall
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
                                             (uint32_t)B[k] *
                                             (uint32_t)one_by_x[n - 1],
                                         SGRPROJ_RECIP_BITS);
    }
  }
  // Use the A[] and B[] arrays to calculate the filtered image
  for (i = 0; i < height; ++i) {
    if (!(i & 1)) {  // even row
      for (j = 0; j < width; ++j) {
        const int k = i * buf_stride + j;
        const int l = i * dgd_stride + j;
        const int m = i * dst_stride + j;
        const int nb = 5;
        const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
                              5;
        const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
                              5;
        const int32_t v = a * dgd[l] + b;
        dst[m] =
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
      }
    } else if (i != height - 1) {  // odd row and not last
      for (j = 0; j < width; ++j) {
        const int k = i * buf_stride + j;
        const int l = i * dgd_stride + j;
        const int m = i * dst_stride + j;
        const int nb = 6;
        const int buf_stride2 = 2 * buf_stride;
        const int32_t a = A[k] * 16 + (A[k - 1] + A[k + 1]) * 14 +
                          (A[k - buf_stride2] + A[k + buf_stride2]) * 4 +
                          (A[k - 1 - buf_stride2] + A[k - 1 + buf_stride2] +
                           A[k + 1 - buf_stride2] + A[k + 1 + buf_stride2]) *
                              3;
        const int32_t b = B[k] * 16 + (B[k - 1] + B[k + 1]) * 14 +
                          (B[k - buf_stride2] + B[k + buf_stride2]) * 4 +
                          (B[k - 1 - buf_stride2] + B[k - 1 + buf_stride2] +
                           B[k + 1 - buf_stride2] + B[k + 1 + buf_stride2]) *
                              3;
        const int32_t v = a * dgd[l] + b;
        dst[m] =
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
      }
    } else {  // odd row and last
      for (j = 0; j < width; ++j) {
        const int k = i * buf_stride + j;
        const int l = i * dgd_stride + j;
        const int m = i * dst_stride + j;
        const int nb = 6;
        const int buf_stride2 = 2 * buf_stride;
        const int32_t a = A[k] * 16 + (A[k - 1] + A[k + 1]) * 14 +
                          A[k - buf_stride2] * 8 +
                          (A[k - 1 - buf_stride2] + A[k + 1 - buf_stride2]) * 6;
        const int32_t b = B[k] * 16 + (B[k - 1] + B[k + 1]) * 14 +
                          B[k - buf_stride2] * 8 +
                          (B[k - 1 - buf_stride2] + B[k + 1 - buf_stride2]) * 6;
        const int32_t v = a * dgd[l] + b;
        dst[m] =
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
      }
    }
  }
}
#endif  // CONFIG_FAST_SGR

static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
                                                int height, int dgd_stride,
                                                int32_t *dst, int dst_stride,
                                                int bit_depth, int r, int eps) {
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 16 bytes, for consistency
  // with the SIMD version of this function.
  int buf_stride = ((width_ext + 3) & ~3) + 16;
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
  int32_t *A = A_;
  int32_t *B = B_;
  int i, j;

  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
         "Need SGRPROJ_BORDER_* >= r+1");

  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
  for (i = -1; i < height + 1; ++i) {
    for (j = -1; j < width + 1; ++j) {
      const int k = i * buf_stride + j;
      const int n = (2 * r + 1) * (2 * r + 1);

      // a < 2^16 * n < 2^22 regardless of bit depth
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
      // b < 2^8 * n < 2^14 regardless of bit depth
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);

      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
      // This bound on p is due to:
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
      //
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
      // This is an artefact of rounding, and can only happen if all pixels
      // are (almost) identical, so in this case we saturate to p=0.
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;

      // Note: If MAX_RADIUS <= 2, then this 's' is a function only of
      // r and eps. Further, this is the only place we use 'eps', so we could
      // pre-calculate 's' for each parameter set and store that in place of
      // 'eps'.
      uint32_t s = sgrproj_mtable[eps - 1][n - 1];

      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
      // (this holds even after accounting for the rounding in s)
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);

      // Note: We have to be quite careful about the value of A[k].
      // This is used as a blend factor between individual pixel values and the
      // local mean. So it logically has a range of [0, 256], including both
      // endpoints.
      //
      // This is a pain for hardware, as we'd like something which can be stored
      // in exactly 8 bits.
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
      // slightly above 2^(8 + bit depth), due to rounding in the value of
      // one_by_x[25-1].
      //
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
      // overflow), without significantly affecting the final result: z == 0
      // implies that the image is essentially "flat", so the local mean and
      // individual pixel values are very similar.
      //
      // Note that saturating on the other side, i.e. requiring A[k] <= 255,
      // would be a bad idea, as that corresponds to the case where the image
      // is very variable, when we want to preserve the local pixel value as
      // much as possible.
      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]

      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
      // one_by_x[n - 1] = round(2^12 / n)
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
      // and B[k] is set to a value < 2^(8 + bit depth)
      // This holds even with the rounding in one_by_x and in the overall
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
                                             (uint32_t)B[k] *
                                             (uint32_t)one_by_x[n - 1],
                                         SGRPROJ_RECIP_BITS);
    }
  }
  // Use the A[] and B[] arrays to calculate the filtered image
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int k = i * buf_stride + j;
      const int l = i * dgd_stride + j;
      const int m = i * dst_stride + j;
      const int nb = 5;
      const int32_t a =
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
              4 +
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
              3;
      const int32_t b =
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
              4 +
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
              3;
      const int32_t v = a * dgd[l] + b;
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
    }
  }
}

void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
                                  int dgd_stride, int32_t *flt1, int32_t *flt2,
                                  int flt_stride, const sgr_params_type *params,
                                  int bit_depth, int highbd) {
  int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
  const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
  int32_t *dgd32 =
      dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;

  if (highbd) {
    const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
        dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
      }
    }
  } else {
    for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
      for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
        dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
      }
    }
  }

#if CONFIG_FAST_SGR
  av1_selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
                                           flt1, flt_stride, bit_depth,
                                           params->r1, params->e1);
  av1_selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
                                           flt2, flt_stride, bit_depth,
                                           params->r2, params->e2);
#else
  av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
                                      flt_stride, bit_depth, params->r1,
                                      params->e1);
  av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt2,
                                      flt_stride, bit_depth, params->r2,
                                      params->e2);
#endif  // CONFIG_FAST_SGR
}

void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
                                    int stride, int eps, const int *xqd,
                                    uint8_t *dst8, int dst_stride,
                                    int32_t *tmpbuf, int bit_depth,
                                    int highbd) {
  int xq[2];
  int32_t *flt1 = tmpbuf;