Commit a9d41e88 authored by Steinar Midtskogen, committed by Jean-Marc Valin

Merge dering/clpf rdo and filtering

* Dering and clpf were merged into a single pass.
* 32x32 and 128x128 filter block sizes for clpf were removed.
* RDO for dering and clpf merged and improved:
  - "0" no longer required to be in the strength selection
  - Dering strength can now be 0, 1 or 2 bits per block

              LL    HL
PSNR:       -0.04 -0.01
PSNR HVS:   -0.27 -0.18
SSIM:       -0.15 +0.01
CIEDE 2000: -0.11 -0.03
APSNR:      -0.03 -0.00
MS SSIM:    -0.18 -0.11

Change-Id: I9f002a16ad218eab6007f90f1f176232443495f0
parent f5931e5e
......@@ -852,19 +852,21 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint
specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
if (aom_config("CONFIG_CDEF") eq "yes") {
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd";
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd";
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp";
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp";
# VS compiling for 32 bit targets does not support vector types in
# structs as arguments, which makes the v256 type of the intrinsics
# hard to support, so optimizations for this target are disabled.
if ($opts{config} !~ /libs-x86-win32-vs.*/) {
specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/;
specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
}
}
if ($opts{config} !~ /libs-x86-win32-vs.*/) {
specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
}
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd";
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp";
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int dmp";
......
......@@ -215,8 +215,8 @@ if (CONFIG_CDEF)
"${AOM_ROOT}/av1/common/clpf.h"
"${AOM_ROOT}/av1/common/clpf_simd.h"
"${AOM_ROOT}/av1/common/clpf_simd_kernel.h"
"${AOM_ROOT}/av1/common/dering.c"
"${AOM_ROOT}/av1/common/dering.h"
"${AOM_ROOT}/av1/common/cdef.c"
"${AOM_ROOT}/av1/common/cdef.h"
"${AOM_ROOT}/av1/common/od_dering.c"
"${AOM_ROOT}/av1/common/od_dering.h")
......@@ -224,7 +224,7 @@ if (CONFIG_CDEF)
${AOM_AV1_ENCODER_SOURCES}
"${AOM_ROOT}/av1/encoder/clpf_rdo.c"
"${AOM_ROOT}/av1/encoder/clpf_rdo.h"
"${AOM_ROOT}/av1/encoder/pickdering.c")
"${AOM_ROOT}/av1/encoder/pickcdef.c")
set(AOM_AV1_COMMON_SSE2_INTRIN
${AOM_AV1_COMMON_SSE2_INTRIN}
......
......@@ -97,8 +97,8 @@ AV1_COMMON_SRCS-yes += common/od_dering.c
AV1_COMMON_SRCS-yes += common/od_dering.h
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.h
AV1_COMMON_SRCS-yes += common/dering.c
AV1_COMMON_SRCS-yes += common/dering.h
AV1_COMMON_SRCS-yes += common/cdef.c
AV1_COMMON_SRCS-yes += common/cdef.h
endif
ifeq ($(CONFIG_ACCOUNTING),yes)
AV1_COMMON_SRCS-yes += common/accounting.h
......
......@@ -109,7 +109,7 @@ AV1_CX_SRCS-yes += encoder/temporal_filter.h
AV1_CX_SRCS-yes += encoder/mbgraph.c
AV1_CX_SRCS-yes += encoder/mbgraph.h
ifeq ($(CONFIG_CDEF),yes)
AV1_CX_SRCS-yes += encoder/pickdering.c
AV1_CX_SRCS-yes += encoder/pickcdef.c
AV1_CX_SRCS-yes += encoder/clpf_rdo.c
AV1_CX_SRCS-yes += encoder/clpf_rdo.h
AV1_CX_SRCS-yes += encoder/clpf_rdo_simd.h
......
......@@ -21,7 +21,7 @@ struct search_site_config;
struct mv;
union int_mv;
struct yv12_buffer_config;
typedef int16_t od_dering_in;
typedef uint16_t od_dering_in;
EOF
}
forward_decls qw/av1_common_forward_decls/;
......@@ -755,10 +755,10 @@ if (aom_config("CONFIG_CDEF") eq "yes") {
add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
specialize qw/od_dir_find8 sse4_1/;
add_proto qw/int od_filter_dering_direction_4x4/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
add_proto qw/int od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
specialize qw/od_filter_dering_direction_4x4 sse4_1/;
add_proto qw/int od_filter_dering_direction_8x8/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
add_proto qw/int od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
specialize qw/od_filter_dering_direction_8x8 sse4_1/;
}
......
......@@ -370,6 +370,7 @@ typedef struct {
#endif // CONFIG_NEW_QUANT
/* deringing gain *per-superblock* */
int8_t dering_gain;
int8_t clpf_strength;
#if CONFIG_DELTA_Q
int current_q_index;
#endif
......
......@@ -9,22 +9,87 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <string.h>
#include <assert.h>
#include <math.h>
#include <string.h>
#include "./aom_scale_rtcd.h"
#include "aom/aom_integer.h"
#include "av1/common/dering.h"
#include "av1/common/cdef.h"
#include "av1/common/od_dering.h"
#include "av1/common/onyxc_int.h"
#include "av1/common/reconinter.h"
#include "av1/common/od_dering.h"
int compute_level_from_index(int global_level, int gi) {
static const int dering_gains[DERING_REFINEMENT_LEVELS] = { 0, 11, 16, 22 };
int level;
if (global_level == 0) return 0;
level = (global_level * dering_gains[gi] + 8) >> 4;
return clamp(level, gi, MAX_DERING_LEVEL - 1);
/* Lookup table with DERING_STRENGTHS entries that maps a dering strength
 * index to the dering level passed on to the filter.  av1_cdef_frame()
 * indexes it with the per-level values decoded by id_to_levels().  The
 * entries are non-decreasing: unit steps up to 8, then progressively
 * coarser steps up to 63. */
int dering_level_table[DERING_STRENGTHS] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 17, 20, 24, 28, 33, 39, 46, 54, 63
};
#ifndef NDEBUG
/* Returns 1 when the first `num` entries of `arr` are in non-decreasing
 * order and 0 otherwise.  Arrays with fewer than two entries count as
 * sorted. */
static int is_sorted(const int *arr, int num) {
  int idx;
  for (idx = 1; idx < num; idx++) {
    /* A single descending neighbour pair is enough to reject. */
    if (arr[idx] < arr[idx - 1]) return 0;
  }
  return 1;
}
#endif
/* Packs the dering levels and CLPF strengths into a single integer id.
 *
 * The values are treated as digits of a mixed-radix number: first the
 * DERING_REFINEMENT_LEVELS entries of `lev` in base DERING_STRENGTHS, then
 * the CLPF_REFINEMENT_LEVELS entries of `str` in base CLPF_STRENGTHS, most
 * significant digit first.  Debug builds assert that both inputs are in
 * non-decreasing order. */
uint32_t levels_to_id(const int lev[DERING_REFINEMENT_LEVELS],
                      const int str[CLPF_REFINEMENT_LEVELS]) {
  uint32_t packed = 0;
  int k;

  assert(is_sorted(lev, DERING_REFINEMENT_LEVELS));
  assert(is_sorted(str, CLPF_REFINEMENT_LEVELS));

  /* Dering levels form the most significant digits. */
  k = 0;
  while (k < DERING_REFINEMENT_LEVELS) {
    packed *= DERING_STRENGTHS;
    packed += lev[k];
    k++;
  }

  /* CLPF strengths are appended as the least significant digits. */
  k = 0;
  while (k < CLPF_REFINEMENT_LEVELS) {
    packed *= CLPF_STRENGTHS;
    packed += str[k];
    k++;
  }

  return packed;
}
/* Splits a combined strength id into its dering levels and CLPF strengths.
 *
 * The id is read as a mixed-radix number: the CLPF strengths are the least
 * significant digits (base CLPF_STRENGTHS) and the dering levels the more
 * significant ones (base DERING_STRENGTHS), matching the digit order used
 * by levels_to_id().
 *
 * After decoding, each table is packed: adjacent duplicates are removed so
 * the distinct values sit at the front, and the tail repeats the last
 * value.  Debug builds assert that both resulting tables are in
 * non-decreasing order.
 *
 * lev: receives DERING_REFINEMENT_LEVELS dering strength indices.
 * str: receives CLPF_REFINEMENT_LEVELS CLPF strengths.
 * id:  combined id to decode. */
void id_to_levels(int lev[DERING_REFINEMENT_LEVELS],
                  int str[CLPF_REFINEMENT_LEVELS], uint32_t id) {
  int i, j;

  /* Peel off the digits, least significant (last CLPF entry) first. */
  for (i = CLPF_REFINEMENT_LEVELS - 1; i >= 0; i--) {
    str[i] = id % CLPF_STRENGTHS;
    id /= CLPF_STRENGTHS;
  }
  for (i = DERING_REFINEMENT_LEVELS - 1; i >= 0; i--) {
    lev[i] = id % DERING_STRENGTHS;
    id /= DERING_STRENGTHS;
  }

  /* Pack tables: j is the position being compared with its predecessor.
   * When the two are equal, the tail is shifted down by one entry, which
   * leaves a copy of the last value in the final slot; otherwise j moves
   * on.  i only counts iterations so that every entry is visited once. */
  for (i = j = 1; i < DERING_REFINEMENT_LEVELS && j < DERING_REFINEMENT_LEVELS;
       i++) {
    if (lev[j - 1] == lev[j]) {
      memmove(&lev[j - 1], &lev[j],
              (DERING_REFINEMENT_LEVELS - j) * sizeof(*lev));
    } else {
      j++;
    }
  }
  /* Same packing for the CLPF table, bounded by its own size. */
  for (i = j = 1; i < CLPF_REFINEMENT_LEVELS && j < CLPF_REFINEMENT_LEVELS;
       i++) {
    if (str[j - 1] == str[j]) {
      memmove(&str[j - 1], &str[j],
              (CLPF_REFINEMENT_LEVELS - j) * sizeof(*str));
    } else {
      j++;
    }
  }

  assert(is_sorted(lev, DERING_REFINEMENT_LEVELS));
  assert(is_sorted(str, CLPF_REFINEMENT_LEVELS));
}
/* Derives the bit counts for the dering and CLPF selections.
 *
 * For each table, counts one plus the number of positions whose value
 * differs from its predecessor, then stores get_msb() of that count in
 * the corresponding output.
 * NOTE(review): get_msb() is defined elsewhere; presumably it returns
 * floor(log2(x)), in which case a count of 3 yields 1 bit -- confirm this
 * is intended.
 *
 * lev:         DERING_REFINEMENT_LEVELS dering strength indices.
 * str:         CLPF_REFINEMENT_LEVELS CLPF strengths.
 * dering_bits: receives the result for the dering table.
 * clpf_bits:   receives the result for the CLPF table. */
void cdef_get_bits(const int *lev, const int *str, int *dering_bits,
                   int *clpf_bits) {
  int k;

  /* Both counters start at one: the first entry is always counted. */
  *clpf_bits = 1;
  *dering_bits = 1;

  for (k = 0; k + 1 < DERING_REFINEMENT_LEVELS; k++) {
    if (lev[k + 1] != lev[k]) ++*dering_bits;
  }
  for (k = 0; k + 1 < CLPF_REFINEMENT_LEVELS; k++) {
    if (str[k + 1] != str[k]) ++*clpf_bits;
  }

  *dering_bits = get_msb(*dering_bits);
  *clpf_bits = get_msb(*clpf_bits);
}
int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) {
......@@ -82,7 +147,7 @@ int sb_compute_dering_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
}
static INLINE void copy_8x8_16bit_to_8bit(uint8_t *dst, int dstride,
int16_t *src, int sstride) {
uint16_t *src, int sstride) {
int i, j;
for (i = 0; i < 8; i++)
for (j = 0; j < 8; j++)
......@@ -90,7 +155,7 @@ static INLINE void copy_8x8_16bit_to_8bit(uint8_t *dst, int dstride,
}
static INLINE void copy_4x4_16bit_to_8bit(uint8_t *dst, int dstride,
int16_t *src, int sstride) {
uint16_t *src, int sstride) {
int i, j;
for (i = 0; i < 4; i++)
for (j = 0; j < 4; j++)
......@@ -98,7 +163,7 @@ static INLINE void copy_4x4_16bit_to_8bit(uint8_t *dst, int dstride,
}
/* TODO: Optimize this function for SSE. */
void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride, int16_t *src,
void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride, uint16_t *src,
dering_list *dlist, int dering_count,
int bsize) {
int bi, bx, by;
......@@ -120,11 +185,10 @@ void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride, int16_t *src,
}
/* TODO: Optimize this function for SSE. */
static void copy_sb8_16(AV1_COMMON *cm, int16_t *dst, int dstride,
static void copy_sb8_16(UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
const uint8_t *src, int src_voffset, int src_hoffset,
int sstride, int vsize, int hsize) {
int r, c;
(void)cm;
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
const uint16_t *base =
......@@ -134,26 +198,28 @@ static void copy_sb8_16(AV1_COMMON *cm, int16_t *dst, int dstride,
dst[r * dstride + c] = base[r * sstride + c];
}
}
} else
} else {
#endif
{
const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
for (r = 0; r < vsize; r++) {
for (c = 0; c < hsize; c++) {
dst[r * dstride + c] = base[r * sstride + c];
}
}
#if CONFIG_AOM_HIGHBITDEPTH
}
#endif
}
void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
MACROBLOCKD *xd, int global_level) {
void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd,
uint32_t global_level, int clpf_strength_u,
int clpf_strength_v) {
int r, c;
int sbr, sbc;
int nhsb, nvsb;
int16_t src[OD_DERING_INBUF_SIZE];
int16_t *linebuf[3];
int16_t colbuf[3][OD_BSIZE_MAX + 2 * OD_FILT_VBORDER][OD_FILT_HBORDER];
uint16_t src[OD_DERING_INBUF_SIZE];
uint16_t *linebuf[3];
uint16_t colbuf[3][OD_BSIZE_MAX + 2 * OD_FILT_VBORDER][OD_FILT_HBORDER];
dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
unsigned char *row_dering, *prev_row_dering, *curr_row_dering;
int dering_count;
......@@ -164,12 +230,13 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int pli;
int dering_left;
int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
int nplanes;
if (xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
xd->plane[2].subsampling_x == xd->plane[2].subsampling_y)
nplanes = 3;
else
nplanes = 1;
int nplanes = 3;
int lev[DERING_REFINEMENT_LEVELS];
int str[CLPF_REFINEMENT_LEVELS];
int chroma_dering =
xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
id_to_levels(lev, str, global_level);
nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
av1_setup_dst_planes(xd->plane, frame, 0, 0);
......@@ -195,29 +262,46 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
}
dering_left = 1;
for (sbc = 0; sbc < nhsb; sbc++) {
int level;
int level, clpf_strength;
int nhb, nvb;
int cstart = 0;
BOUNDARY_TYPE boundary_type =
cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
MAX_MIB_SIZE * sbc]
->mbmi.boundary_info;
if (!dering_left) cstart = -OD_FILT_HBORDER;
nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
level = compute_level_from_index(
global_level, cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
MAX_MIB_SIZE * sbc]
->mbmi.dering_gain);
level = dering_level_table
[lev[cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
MAX_MIB_SIZE * sbc]
->mbmi.dering_gain]];
clpf_strength =
str[cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
MAX_MIB_SIZE * sbc]
->mbmi.clpf_strength];
clpf_strength += clpf_strength == 3;
curr_row_dering[sbc] = 0;
if (level == 0 ||
if ((level == 0 && clpf_strength == 0) ||
(dering_count = sb_compute_dering_list(
cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, dlist)) == 0) {
dering_left = 0;
continue;
}
curr_row_dering[sbc] = 1;
for (pli = 0; pli < nplanes; pli++) {
int16_t dst[OD_BSIZE_MAX * OD_BSIZE_MAX];
uint16_t dst[OD_BSIZE_MAX * OD_BSIZE_MAX];
int threshold;
int coffset;
int rend, cend;
int clpf_damping = 3 - (pli != AOM_PLANE_Y) + (cm->base_qindex >> 6);
if (pli) {
if (!chroma_dering) level = 0;
clpf_strength = pli == 1 ? clpf_strength_u : clpf_strength_v;
clpf_strength += clpf_strength == 3;
}
if (sbc == nhsb - 1)
cend = (nhb << bsize[pli]);
else
......@@ -347,14 +431,15 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
threshold = (level * 5 + 4) >> 3 << coeff_shift;
else
threshold = level << coeff_shift;
if (threshold == 0) continue;
od_dering(
dst, &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
dec[pli], dir, pli, dlist, dering_count, threshold, coeff_shift);
if (threshold == 0 && clpf_strength == 0) continue;
od_dering(dst,
&src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
dec[pli], dir, pli, dlist, dering_count, threshold,
clpf_strength, clpf_damping, coeff_shift, boundary_type);
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
copy_dering_16bit_to_16bit(
(int16_t *)&CONVERT_TO_SHORTPTR(
&CONVERT_TO_SHORTPTR(
xd->plane[pli]
.dst.buf)[xd->plane[pli].dst.stride *
(MAX_MIB_SIZE * sbr << bsize[pli]) +
......
......@@ -11,32 +11,48 @@
#ifndef AV1_COMMON_DERING_H_
#define AV1_COMMON_DERING_H_
#include "av1/common/od_dering.h"
#include "av1/common/onyxc_int.h"
#include "aom/aom_integer.h"
// ceil(log2(DERING_STRENGTHS^DERING_REFINEMENT_LEVELS *
// CLPF_STRENGTHS^CLPF_REFINEMENT_LEVELS))
#define DERING_LEVEL_BITS (22)
#define MAX_DERING_LEVEL (1LL << DERING_LEVEL_BITS)
#define DERING_REFINEMENT_BITS 2
#define DERING_REFINEMENT_LEVELS 4
#define CLPF_REFINEMENT_BITS 1
#define CLPF_REFINEMENT_LEVELS 2
#define DERING_STRENGTHS 21
#define CLPF_STRENGTHS 4
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "od_dering.h"
#include "av1/common/od_dering.h"
#include "av1/common/onyxc_int.h"
#include "./od_dering.h"
#ifdef __cplusplus
extern "C" {
#endif
#define DERING_LEVEL_BITS 6
#define MAX_DERING_LEVEL (1 << DERING_LEVEL_BITS)
extern int dering_level_table[DERING_STRENGTHS];
#define DERING_REFINEMENT_BITS 2
#define DERING_REFINEMENT_LEVELS 4
uint32_t levels_to_id(const int lev[DERING_REFINEMENT_LEVELS],
const int str[CLPF_REFINEMENT_LEVELS]);
void id_to_levels(int lev[DERING_REFINEMENT_LEVELS],
int str[CLPF_REFINEMENT_LEVELS], uint32_t id);
void cdef_get_bits(const int *lev, const int *str, int *dering_bits,
int *clpf_bits);
int compute_level_from_index(int global_level, int gi);
int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col);
int sb_compute_dering_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
dering_list *dlist);
void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
MACROBLOCKD *xd, int global_level);
void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd,
uint32_t global_level, int clpf_strength_u,
int clpf_strength_v);
int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
AV1_COMMON *cm, MACROBLOCKD *xd);
void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
AV1_COMMON *cm, MACROBLOCKD *xd);
#ifdef __cplusplus
} // extern "C"
......
......@@ -14,9 +14,9 @@
#include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h"
int sign(int i) { return i < 0 ? -1 : 1; }
static int sign(int i) { return i < 0 ? -1 : 1; }
int constrain(int x, int s, unsigned int damping) {
static int constrain(int x, int s, unsigned int damping) {
return sign(x) *
AOMMAX(0, abs(x) - AOMMAX(0, abs(x) - s +
(abs(x) >> (damping - get_msb(s)))));
......@@ -59,8 +59,8 @@ void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
}
}
#if CONFIG_AOM_HIGHBITDEPTH
// Identical to aom_clpf_block_c() apart from "src" and "dst".
// TODO(stemidts): Put under CONFIG_AOM_HIGHBITDEPTH if CDEF does 8 bit
// internally
void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt,
......@@ -88,235 +88,3 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
}
}
}
#endif
// Return number of filtered blocks
void av1_clpf_frame(
const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org,
AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, int plane,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int,
int, unsigned int, unsigned int, int8_t *, int)) {
/* Constrained low-pass filter (CLPF) */
int c, k, l, m, n;
const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
const int suby = plane != AOM_PLANE_Y && frame->subsampling_y;
const int bs = (subx || suby) ? 4 : 8;
const int bslog = get_msb(bs);
int width = plane != AOM_PLANE_Y ? frame->uv_crop_width : frame->y_crop_width;
int height =
plane != AOM_PLANE_Y ? frame->uv_crop_height : frame->y_crop_height;
int xpos, ypos;
const int sstride = plane != AOM_PLANE_Y ? frame->uv_stride : frame->y_stride;
int dstride = bs;
const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
uint8_t *cache = NULL;
uint8_t **cache_ptr = NULL;
uint8_t **cache_dst = NULL;
int cache_idx = 0;
const int cache_size = num_fb_hor << (2 * fb_size_log2);
const int cache_blocks = cache_size / (bs * bs);
uint8_t *src_buffer =
plane != AOM_PLANE_Y
? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer)
: frame->y_buffer;
uint8_t *dst_buffer;
// Damping is the filter cut-off log2 point for the constrain function.
// For instance, if the damping is 5, neighbour differences above 32 will
// be ignored and half of the strength will be applied for a difference of 16.
int damping =
cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6);
// Make buffer space for in-place filtering
#if CONFIG_AOM_HIGHBITDEPTH
strength <<= (cm->bit_depth - 8);
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size << !!cm->use_highbitdepth));
dst_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache;
#else
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
dst_buffer = cache;
#endif
CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr)));
CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst)));
memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst));
// Iterate over all filter blocks
for (k = 0; k < num_fb_ver; k++) {
for (l = 0; l < num_fb_hor; l++) {
int h, w;
int allskip = !(enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2);
const int xoff = l << fb_size_log2;
const int yoff = k << fb_size_log2;
for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) {
xpos = xoff + n * bs;
ypos = yoff + m * bs;
if (xpos < width && ypos < height) {
allskip &=
cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
->mbmi.skip;
}
}
}
// Calculate the actual filter block size near frame edges
h = AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
w = AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1);
h += !h << fb_size_log2;
w += !w << fb_size_log2;
if (!allskip && // Do not filter the block if all is skip encoded
(!enable_fb_flag ||
// Only called if fb_flag enabled (luma only)
decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
fb_size_log2,
cm->clpf_blocks + yoff / MIN_FB_SIZE * cm->clpf_stride +
xoff / MIN_FB_SIZE,
plane))) {
// Iterate over all smaller blocks inside the filter block
for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
int sizex, sizey;
xpos = xoff + n * bs;
ypos = yoff + m * bs;
sizex = AOMMIN(width - xpos, bs);
sizey = AOMMIN(height - ypos, bs);
if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
->mbmi.skip ||
(enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2)) {
BOUNDARY_TYPE boundary_type =
cm->mi[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
.mbmi.boundary_info;
// Temporary buffering needed for in-place filtering
if (cache_ptr[cache_idx]) {
// Copy filtered block back into the frame
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
if (sizex == 8) {
for (c = 0; c < sizey; c++) {
*(uint64_t *)(d + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
*(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
}
} else if (sizex == 4) {
for (c = 0; c < sizey; c++)
*(uint64_t *)(d + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
} else {
for (c = 0; c < sizey; c++)
memcpy(d + c * sstride, cache_ptr[cache_idx] + c * bs * 2,
sizex);
}
} else {
if (sizex == 8)
for (c = 0; c < sizey; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
else if (sizex == 4)
for (c = 0; c < sizey; c++)
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
else
for (c = 0; c < sizey; c++)
memcpy(cache_dst[cache_idx] + c * sstride,
cache_ptr[cache_idx] + c * bs, sizex);
}
#else
if (sizex == 8)
for (c = 0; c < sizey; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
else if (sizex == 4)
for (c = 0; c < sizey; c++)