diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index eafdb2bc35c38df1bf3c6a8b31bd4cb6d180a7f9..4aa156381a5c3e1bc69c485a78ed4b2c7d8fb692 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -852,19 +852,21 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; if (aom_config("CONFIG_CDEF") eq "yes") { + add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd"; if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd"; add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp"; add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp"; # VS compiling for 32 bit targets does not support vector types in # structs as arguments, which makes the v256 type of the intrinsics # hard to support, so optimizations for this target are disabled. if ($opts{config} !~ /libs-x86-win32-vs.*/) { - specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/; specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/; specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/; } } + if ($opts{config} !~ /libs-x86-win32-vs.*/) { + specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/; + } add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd"; add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp"; add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int dmp"; diff --git a/av1/av1.cmake b/av1/av1.cmake index d90578018863726c8de39a9b813ff762a9216e75..566d9be3775139f9baaa977256d4aa7961d099e1 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake @@ -215,8 +215,8 @@ if (CONFIG_CDEF) "${AOM_ROOT}/av1/common/clpf.h" "${AOM_ROOT}/av1/common/clpf_simd.h" "${AOM_ROOT}/av1/common/clpf_simd_kernel.h" - "${AOM_ROOT}/av1/common/dering.c" - "${AOM_ROOT}/av1/common/dering.h" + "${AOM_ROOT}/av1/common/cdef.c" + "${AOM_ROOT}/av1/common/cdef.h" "${AOM_ROOT}/av1/common/od_dering.c" "${AOM_ROOT}/av1/common/od_dering.h") @@ -224,7 +224,7 @@ if (CONFIG_CDEF) ${AOM_AV1_ENCODER_SOURCES} "${AOM_ROOT}/av1/encoder/clpf_rdo.c" "${AOM_ROOT}/av1/encoder/clpf_rdo.h" - "${AOM_ROOT}/av1/encoder/pickdering.c") + "${AOM_ROOT}/av1/encoder/pickcdef.c") set(AOM_AV1_COMMON_SSE2_INTRIN ${AOM_AV1_COMMON_SSE2_INTRIN} diff --git a/av1/av1_common.mk b/av1/av1_common.mk index 92e78eb10b54dde5569f02dc1b5fb99b8296220f..ca0215bc9e6d05830d9bd68c43cf095347fe59ab 100644 --- a/av1/av1_common.mk +++ b/av1/av1_common.mk @@ -97,8 +97,8 @@ AV1_COMMON_SRCS-yes += common/od_dering.c AV1_COMMON_SRCS-yes += common/od_dering.h AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.c AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.h -AV1_COMMON_SRCS-yes += common/dering.c -AV1_COMMON_SRCS-yes += common/dering.h +AV1_COMMON_SRCS-yes += common/cdef.c +AV1_COMMON_SRCS-yes += common/cdef.h endif ifeq ($(CONFIG_ACCOUNTING),yes) AV1_COMMON_SRCS-yes += common/accounting.h diff --git a/av1/av1_cx.mk b/av1/av1_cx.mk index e4ae420016d31720a8b02663cf04a61e68338cde..9b624a2995206b75f7670580bfaa292b24931d80 100644 --- a/av1/av1_cx.mk +++ b/av1/av1_cx.mk @@ -109,7 +109,7 @@ AV1_CX_SRCS-yes += encoder/temporal_filter.h AV1_CX_SRCS-yes += encoder/mbgraph.c AV1_CX_SRCS-yes += encoder/mbgraph.h ifeq ($(CONFIG_CDEF),yes) -AV1_CX_SRCS-yes += encoder/pickdering.c +AV1_CX_SRCS-yes += encoder/pickcdef.c AV1_CX_SRCS-yes += encoder/clpf_rdo.c AV1_CX_SRCS-yes += encoder/clpf_rdo.h AV1_CX_SRCS-yes += encoder/clpf_rdo_simd.h diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 100dc4f7c3002d35aead8e779fbbf884db175a55..b389b70de8e9acc1ecc97f13fd5aef682380f94e 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl @@ -21,7 +21,7 @@ struct search_site_config; struct mv; union int_mv; struct yv12_buffer_config; -typedef int16_t od_dering_in; +typedef uint16_t od_dering_in; EOF } forward_decls qw/av1_common_forward_decls/; @@ -755,10 +755,10 @@ if (aom_config("CONFIG_CDEF") eq "yes") { add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift"; specialize qw/od_dir_find8 sse4_1/; - add_proto qw/int od_filter_dering_direction_4x4/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir"; + add_proto qw/int od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir"; specialize qw/od_filter_dering_direction_4x4 sse4_1/; - add_proto qw/int od_filter_dering_direction_8x8/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir"; + add_proto qw/int od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir"; specialize qw/od_filter_dering_direction_8x8 sse4_1/; } diff --git a/av1/common/blockd.h b/av1/common/blockd.h index a362698c66b0c6ec098efe5efbec76d9b69e01e3..dc433a29e058f11a4831b55fb06fcd516bab55e1 100644 --- a/av1/common/blockd.h +++ b/av1/common/blockd.h @@ -370,6 +370,7 @@ typedef struct { #endif // CONFIG_NEW_QUANT /* deringing gain *per-superblock* */ int8_t dering_gain; + int8_t clpf_strength; #if CONFIG_DELTA_Q int current_q_index; #endif diff --git a/av1/common/dering.c b/av1/common/cdef.c similarity index 73% rename from av1/common/dering.c rename to av1/common/cdef.c index 4f78c2c746f64da327c5dacdca8144ea51fbc3de..e2efdbc20fc3886a3ce9d6006bdc1199151023f5 100644 --- a/av1/common/dering.c +++ b/av1/common/cdef.c @@ -9,22 +9,87 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include +#include #include +#include #include "./aom_scale_rtcd.h" #include "aom/aom_integer.h" -#include "av1/common/dering.h" +#include "av1/common/cdef.h" +#include "av1/common/od_dering.h" #include "av1/common/onyxc_int.h" #include "av1/common/reconinter.h" -#include "av1/common/od_dering.h" -int compute_level_from_index(int global_level, int gi) { - static const int dering_gains[DERING_REFINEMENT_LEVELS] = { 0, 11, 16, 22 }; - int level; - if (global_level == 0) return 0; - level = (global_level * dering_gains[gi] + 8) >> 4; - return clamp(level, gi, MAX_DERING_LEVEL - 1); +int dering_level_table[DERING_STRENGTHS] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 17, 20, 24, 28, 33, 39, 46, 54, 63 +}; + +#ifndef NDEBUG +static int is_sorted(const int *arr, int num) { + int sorted = 1; + while (sorted && num-- > 1) sorted &= arr[num] >= arr[num - 1]; + return sorted; +} +#endif + +uint32_t levels_to_id(const int lev[DERING_REFINEMENT_LEVELS], + const int str[CLPF_REFINEMENT_LEVELS]) { + uint32_t id = 0; + int i; + + assert(is_sorted(lev, DERING_REFINEMENT_LEVELS)); + assert(is_sorted(str, CLPF_REFINEMENT_LEVELS)); + for (i = 0; i < DERING_REFINEMENT_LEVELS; i++) + id = id * DERING_STRENGTHS + lev[i]; + for (i = 0; i < CLPF_REFINEMENT_LEVELS; i++) + id = id * CLPF_STRENGTHS + str[i]; + return id; +} + +void id_to_levels(int lev[DERING_REFINEMENT_LEVELS], + int str[CLPF_REFINEMENT_LEVELS], uint32_t id) { + int i; + for (i = CLPF_REFINEMENT_LEVELS - 1; i >= 0; i--) { + str[i] = id % CLPF_STRENGTHS; + id /= CLPF_STRENGTHS; + } + for (i = DERING_REFINEMENT_LEVELS - 1; i >= 0; i--) { + lev[i] = id % DERING_STRENGTHS; + id /= DERING_STRENGTHS; + } + + // Pack tables + int j; + for (i = j = 1; i < DERING_REFINEMENT_LEVELS && j < DERING_REFINEMENT_LEVELS; + i++) + if (lev[j - 1] == lev[j]) + memmove(&lev[j - 1], &lev[j], + (DERING_REFINEMENT_LEVELS - j) * sizeof(*lev)); + else + j++; + for (i = j = 1; i < CLPF_REFINEMENT_LEVELS && j < DERING_REFINEMENT_LEVELS; + i++) + if (str[j - 1] == str[j]) + memmove(&str[j - 1], &str[j], + (CLPF_REFINEMENT_LEVELS - i) * sizeof(*str)); + else + j++; + + assert(is_sorted(lev, DERING_REFINEMENT_LEVELS)); + assert(is_sorted(str, CLPF_REFINEMENT_LEVELS)); +} + +void cdef_get_bits(const int *lev, const int *str, int *dering_bits, + int *clpf_bits) { + int i; + *dering_bits = *clpf_bits = 1; + for (i = 1; i < DERING_REFINEMENT_LEVELS; i++) + (*dering_bits) += lev[i] != lev[i - 1]; + for (i = 1; i < CLPF_REFINEMENT_LEVELS; i++) + (*clpf_bits) += str[i] != str[i - 1]; + + *dering_bits = get_msb(*dering_bits); + *clpf_bits = get_msb(*clpf_bits); } int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) { @@ -82,7 +147,7 @@ int sb_compute_dering_list(const AV1_COMMON *const cm, int mi_row, int mi_col, } static INLINE void copy_8x8_16bit_to_8bit(uint8_t *dst, int dstride, - int16_t *src, int sstride) { + uint16_t *src, int sstride) { int i, j; for (i = 0; i < 8; i++) for (j = 0; j < 8; j++) @@ -90,7 +155,7 @@ static INLINE void copy_8x8_16bit_to_8bit(uint8_t *dst, int dstride, } static INLINE void copy_4x4_16bit_to_8bit(uint8_t *dst, int dstride, - int16_t *src, int sstride) { + uint16_t *src, int sstride) { int i, j; for (i = 0; i < 4; i++) for (j = 0; j < 4; j++) @@ -98,7 +163,7 @@ static INLINE void copy_4x4_16bit_to_8bit(uint8_t *dst, int dstride, } /* TODO: Optimize this function for SSE. */ -void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride, int16_t *src, +void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride, uint16_t *src, dering_list *dlist, int dering_count, int bsize) { int bi, bx, by; @@ -120,11 +185,10 @@ void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride, int16_t *src, } /* TODO: Optimize this function for SSE. */ -static void copy_sb8_16(AV1_COMMON *cm, int16_t *dst, int dstride, +static void copy_sb8_16(UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride, const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize) { int r, c; - (void)cm; #if CONFIG_AOM_HIGHBITDEPTH if (cm->use_highbitdepth) { const uint16_t *base = @@ -134,26 +198,28 @@ static void copy_sb8_16(AV1_COMMON *cm, int16_t *dst, int dstride, dst[r * dstride + c] = base[r * sstride + c]; } } - } else + } else { #endif - { const uint8_t *base = &src[src_voffset * sstride + src_hoffset]; for (r = 0; r < vsize; r++) { for (c = 0; c < hsize; c++) { dst[r * dstride + c] = base[r * sstride + c]; } } +#if CONFIG_AOM_HIGHBITDEPTH } +#endif } -void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - MACROBLOCKD *xd, int global_level) { +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, + uint32_t global_level, int clpf_strength_u, + int clpf_strength_v) { int r, c; int sbr, sbc; int nhsb, nvsb; - int16_t src[OD_DERING_INBUF_SIZE]; - int16_t *linebuf[3]; - int16_t colbuf[3][OD_BSIZE_MAX + 2 * OD_FILT_VBORDER][OD_FILT_HBORDER]; + uint16_t src[OD_DERING_INBUF_SIZE]; + uint16_t *linebuf[3]; + uint16_t colbuf[3][OD_BSIZE_MAX + 2 * OD_FILT_VBORDER][OD_FILT_HBORDER]; dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE]; unsigned char *row_dering, *prev_row_dering, *curr_row_dering; int dering_count; @@ -164,12 +230,13 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int pli; int dering_left; int coeff_shift = AOMMAX(cm->bit_depth - 8, 0); - int nplanes; - if (xd->plane[1].subsampling_x == xd->plane[1].subsampling_y && - xd->plane[2].subsampling_x == xd->plane[2].subsampling_y) - nplanes = 3; - else - nplanes = 1; + int nplanes = 3; + int lev[DERING_REFINEMENT_LEVELS]; + int str[CLPF_REFINEMENT_LEVELS]; + int chroma_dering = + xd->plane[1].subsampling_x == xd->plane[1].subsampling_y && + xd->plane[2].subsampling_x == xd->plane[2].subsampling_y; + id_to_levels(lev, str, global_level); nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; av1_setup_dst_planes(xd->plane, frame, 0, 0); @@ -195,29 +262,46 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, } dering_left = 1; for (sbc = 0; sbc < nhsb; sbc++) { - int level; + int level, clpf_strength; int nhb, nvb; int cstart = 0; + BOUNDARY_TYPE boundary_type = + cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride + + MAX_MIB_SIZE * sbc] + ->mbmi.boundary_info; if (!dering_left) cstart = -OD_FILT_HBORDER; nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc); nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr); - level = compute_level_from_index( - global_level, cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride + - MAX_MIB_SIZE * sbc] - ->mbmi.dering_gain); + level = dering_level_table + [lev[cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride + + MAX_MIB_SIZE * sbc] + ->mbmi.dering_gain]]; + clpf_strength = + str[cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride + + MAX_MIB_SIZE * sbc] + ->mbmi.clpf_strength]; + clpf_strength += clpf_strength == 3; curr_row_dering[sbc] = 0; - if (level == 0 || + if ((level == 0 && clpf_strength == 0) || (dering_count = sb_compute_dering_list( cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, dlist)) == 0) { dering_left = 0; continue; } + curr_row_dering[sbc] = 1; for (pli = 0; pli < nplanes; pli++) { - int16_t dst[OD_BSIZE_MAX * OD_BSIZE_MAX]; + uint16_t dst[OD_BSIZE_MAX * OD_BSIZE_MAX]; int threshold; int coffset; int rend, cend; + int clpf_damping = 3 - (pli != AOM_PLANE_Y) + (cm->base_qindex >> 6); + + if (pli) { + if (!chroma_dering) level = 0; + clpf_strength = pli == 1 ? clpf_strength_u : clpf_strength_v; + clpf_strength += clpf_strength == 3; + } if (sbc == nhsb - 1) cend = (nhb << bsize[pli]); else @@ -347,14 +431,15 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, threshold = (level * 5 + 4) >> 3 << coeff_shift; else threshold = level << coeff_shift; - if (threshold == 0) continue; - od_dering( - dst, &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER], - dec[pli], dir, pli, dlist, dering_count, threshold, coeff_shift); + if (threshold == 0 && clpf_strength == 0) continue; + od_dering(dst, + &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER], + dec[pli], dir, pli, dlist, dering_count, threshold, + clpf_strength, clpf_damping, coeff_shift, boundary_type); #if CONFIG_AOM_HIGHBITDEPTH if (cm->use_highbitdepth) { copy_dering_16bit_to_16bit( - (int16_t *)&CONVERT_TO_SHORTPTR( + &CONVERT_TO_SHORTPTR( xd->plane[pli] .dst.buf)[xd->plane[pli].dst.stride * (MAX_MIB_SIZE * sbr << bsize[pli]) + diff --git a/av1/common/dering.h b/av1/common/cdef.h similarity index 51% rename from av1/common/dering.h rename to av1/common/cdef.h index 73e7bf130ef80a62925c52f4996a90d1f3dd6894..bb010c4189155db2149a9758f44498837330fb07 100644 --- a/av1/common/dering.h +++ b/av1/common/cdef.h @@ -11,32 +11,48 @@ #ifndef AV1_COMMON_DERING_H_ #define AV1_COMMON_DERING_H_ -#include "av1/common/od_dering.h" -#include "av1/common/onyxc_int.h" -#include "aom/aom_integer.h" +// ceil(log2(DERING_STRENGTHS^DERING_REFINEMENT_LEVELS * +// CLPF_STRENGTHS^CLPF_REFINEMENT_LEVELS)) +#define DERING_LEVEL_BITS (22) +#define MAX_DERING_LEVEL (1LL << DERING_LEVEL_BITS) + +#define DERING_REFINEMENT_BITS 2 +#define DERING_REFINEMENT_LEVELS 4 +#define CLPF_REFINEMENT_BITS 1 +#define CLPF_REFINEMENT_LEVELS 2 + +#define DERING_STRENGTHS 21 +#define CLPF_STRENGTHS 4 + #include "./aom_config.h" +#include "aom/aom_integer.h" #include "aom_ports/mem.h" -#include "od_dering.h" +#include "av1/common/od_dering.h" +#include "av1/common/onyxc_int.h" +#include "./od_dering.h" #ifdef __cplusplus extern "C" { #endif -#define DERING_LEVEL_BITS 6 -#define MAX_DERING_LEVEL (1 << DERING_LEVEL_BITS) +extern int dering_level_table[DERING_STRENGTHS]; -#define DERING_REFINEMENT_BITS 2 -#define DERING_REFINEMENT_LEVELS 4 +uint32_t levels_to_id(const int lev[DERING_REFINEMENT_LEVELS], + const int str[CLPF_REFINEMENT_LEVELS]); +void id_to_levels(int lev[DERING_REFINEMENT_LEVELS], + int str[CLPF_REFINEMENT_LEVELS], uint32_t id); +void cdef_get_bits(const int *lev, const int *str, int *dering_bits, + int *clpf_bits); -int compute_level_from_index(int global_level, int gi); int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col); int sb_compute_dering_list(const AV1_COMMON *const cm, int mi_row, int mi_col, dering_list *dlist); -void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - MACROBLOCKD *xd, int global_level); +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd, + uint32_t global_level, int clpf_strength_u, + int clpf_strength_v); -int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, - AV1_COMMON *cm, MACROBLOCKD *xd); +void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, + AV1_COMMON *cm, MACROBLOCKD *xd); #ifdef __cplusplus } // extern "C" diff --git a/av1/common/clpf.c b/av1/common/clpf.c index 7d9933d9857e96df7e60a363c24be32d76d2cf24..1ff21ada223977b9ccdeca05ead7c057031cfb95 100644 --- a/av1/common/clpf.c +++ b/av1/common/clpf.c @@ -14,9 +14,9 @@ #include "aom/aom_image.h" #include "aom_dsp/aom_dsp_common.h" -int sign(int i) { return i < 0 ? -1 : 1; } +static int sign(int i) { return i < 0 ? -1 : 1; } -int constrain(int x, int s, unsigned int damping) { +static int constrain(int x, int s, unsigned int damping) { return sign(x) * AOMMAX(0, abs(x) - AOMMAX(0, abs(x) - s + (abs(x) >> (damping - get_msb(s))))); @@ -59,8 +59,8 @@ void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride, } } -#if CONFIG_AOM_HIGHBITDEPTH // Identical to aom_clpf_block_c() apart from "src" and "dst". +// TODO(stemidts): Put under CONFIG_AOM_HIGHBITDEPTH if CDEF do 8 bit internally void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, @@ -88,235 +88,3 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride, } } } -#endif - -// Return number of filtered blocks -void av1_clpf_frame( - const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org, - AV1_COMMON *cm, int enable_fb_flag, unsigned int strength, - unsigned int fb_size_log2, int plane, - int (*decision)(int, int, const YV12_BUFFER_CONFIG *, - const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int, - int, unsigned int, unsigned int, int8_t *, int)) { - /* Constrained low-pass filter (CLPF) */ - int c, k, l, m, n; - const int subx = plane != AOM_PLANE_Y && frame->subsampling_x; - const int suby = plane != AOM_PLANE_Y && frame->subsampling_y; - const int bs = (subx || suby) ? 4 : 8; - const int bslog = get_msb(bs); - int width = plane != AOM_PLANE_Y ? frame->uv_crop_width : frame->y_crop_width; - int height = - plane != AOM_PLANE_Y ? frame->uv_crop_height : frame->y_crop_height; - int xpos, ypos; - const int sstride = plane != AOM_PLANE_Y ? frame->uv_stride : frame->y_stride; - int dstride = bs; - const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2; - const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2; - uint8_t *cache = NULL; - uint8_t **cache_ptr = NULL; - uint8_t **cache_dst = NULL; - int cache_idx = 0; - const int cache_size = num_fb_hor << (2 * fb_size_log2); - const int cache_blocks = cache_size / (bs * bs); - uint8_t *src_buffer = - plane != AOM_PLANE_Y - ? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer) - : frame->y_buffer; - uint8_t *dst_buffer; - // Damping is the filter cut-off log2 point for the constrain function. - // For instance, if the damping is 5, neighbour differences above 32 will - // be ignored and half of the strength will be applied for a difference of 16. - int damping = - cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6); - -// Make buffer space for in-place filtering -#if CONFIG_AOM_HIGHBITDEPTH - strength <<= (cm->bit_depth - 8); - CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size << !!cm->use_highbitdepth)); - dst_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache; -#else - CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size)); - dst_buffer = cache; -#endif - CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr))); - CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst))); - memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst)); - - // Iterate over all filter blocks - for (k = 0; k < num_fb_ver; k++) { - for (l = 0; l < num_fb_hor; l++) { - int h, w; - int allskip = !(enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2); - const int xoff = l << fb_size_log2; - const int yoff = k << fb_size_log2; - for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) { - for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) { - xpos = xoff + n * bs; - ypos = yoff + m * bs; - if (xpos < width && ypos < height) { - allskip &= - cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride + - (xpos << subx) / MI_SIZE] - ->mbmi.skip; - } - } - } - - // Calculate the actual filter block size near frame edges - h = AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); - w = AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); - h += !h << fb_size_log2; - w += !w << fb_size_log2; - if (!allskip && // Do not filter the block if all is skip encoded - (!enable_fb_flag || - // Only called if fb_flag enabled (luma only) - decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength, - fb_size_log2, - cm->clpf_blocks + yoff / MIN_FB_SIZE * cm->clpf_stride + - xoff / MIN_FB_SIZE, - plane))) { - // Iterate over all smaller blocks inside the filter block - for (m = 0; m < ((h + bs - 1) >> bslog); m++) { - for (n = 0; n < ((w + bs - 1) >> bslog); n++) { - int sizex, sizey; - xpos = xoff + n * bs; - ypos = yoff + m * bs; - sizex = AOMMIN(width - xpos, bs); - sizey = AOMMIN(height - ypos, bs); - if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride + - (xpos << subx) / MI_SIZE] - ->mbmi.skip || - (enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2)) { - BOUNDARY_TYPE boundary_type = - cm->mi[(ypos << suby) / MI_SIZE * cm->mi_stride + - (xpos << subx) / MI_SIZE] - .mbmi.boundary_info; - - // Temporary buffering needed for in-place filtering - if (cache_ptr[cache_idx]) { -// Copy filtered block back into the frame -#if CONFIG_AOM_HIGHBITDEPTH - if (cm->use_highbitdepth) { - uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]); - if (sizex == 8) { - for (c = 0; c < sizey; c++) { - *(uint64_t *)(d + c * sstride) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2); - *(uint64_t *)(d + c * sstride + 4) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8); - } - } else if (sizex == 4) { - for (c = 0; c < sizey; c++) - *(uint64_t *)(d + c * sstride) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2); - } else { - for (c = 0; c < sizey; c++) - memcpy(d + c * sstride, cache_ptr[cache_idx] + c * bs * 2, - sizex); - } - } else { - if (sizex == 8) - for (c = 0; c < sizey; c++) - *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs); - else if (sizex == 4) - for (c = 0; c < sizey; c++) - *(uint32_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint32_t *)(cache_ptr[cache_idx] + c * bs); - else - for (c = 0; c < sizey; c++) - memcpy(cache_dst[cache_idx] + c * sstride, - cache_ptr[cache_idx] + c * bs, sizex); - } -#else - if (sizex == 8) - for (c = 0; c < sizey; c++) - *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs); - else if (sizex == 4) - for (c = 0; c < sizey; c++) - *(uint32_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint32_t *)(cache_ptr[cache_idx] + c * bs); - else - for (c = 0; c < sizey; c++) - memcpy(cache_dst[cache_idx] + c * sstride, - cache_ptr[cache_idx] + c * bs, sizex); -#endif - } -#if CONFIG_AOM_HIGHBITDEPTH - if (cm->use_highbitdepth) { - cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2; - dst_buffer = - CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) - ypos * bs - xpos; - } else { - cache_ptr[cache_idx] = cache + cache_idx * bs * bs; - dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; - } -#else - cache_ptr[cache_idx] = cache + cache_idx * bs * bs; - dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; -#endif - cache_dst[cache_idx] = src_buffer + ypos * sstride + xpos; - if (++cache_idx >= cache_blocks) cache_idx = 0; - -// Apply the filter -#if CONFIG_AOM_HIGHBITDEPTH - if (cm->use_highbitdepth) { - aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer), - CONVERT_TO_SHORTPTR(dst_buffer), sstride, - dstride, xpos, ypos, sizex, sizey, strength, - boundary_type, damping); - } else { - aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos, - ypos, sizex, sizey, strength, boundary_type, - damping); - } -#else - aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos, - ypos, sizex, sizey, strength, boundary_type, - damping); -#endif - } - } - } - } - } - } - - // Copy remaining blocks into the frame - for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx]; - cache_idx++) { -#if CONFIG_AOM_HIGHBITDEPTH - if (cm->use_highbitdepth) { - uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]); - for (c = 0; c < bs; c++) { - *(uint64_t *)(d + c * sstride) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2); - if (bs == 8) - *(uint64_t *)(d + c * sstride + 4) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8); - } - } else { - for (c = 0; c < bs; c++) - if (bs == 4) - *(uint32_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint32_t *)(cache_ptr[cache_idx] + c * bs); - else - *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs); - } -#else - for (c = 0; c < bs; c++) - if (bs == 4) - *(uint32_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint32_t *)(cache_ptr[cache_idx] + c * bs); - else - *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs); -#endif - } - - aom_free(cache); - aom_free(cache_ptr); - aom_free(cache_dst); -} diff --git a/av1/common/clpf.h b/av1/common/clpf.h index b50b7a67fe33652e586de665ab8a8ea7543a0cd1..d6348deb0feba8487cde23c08043657be3ef24a8 100644 --- a/av1/common/clpf.h +++ b/av1/common/clpf.h @@ -13,20 +13,6 @@ #include "av1/common/reconinter.h" -#define MAX_FB_SIZE_LOG2 7 -#define MIN_FB_SIZE_LOG2 5 -#define MAX_FB_SIZE (1 << MAX_FB_SIZE_LOG2) -#define MIN_FB_SIZE (1 << MIN_FB_SIZE_LOG2) - int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G, int H, int b, unsigned int dmp); -void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, - const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm, - int enable_fb_flag, unsigned int strength, - unsigned int fb_size_log2, int plane, - int (*decision)(int, int, const YV12_BUFFER_CONFIG *, - const YV12_BUFFER_CONFIG *, - const AV1_COMMON *cm, int, int, int, - unsigned int, unsigned int, int8_t *, int)); - #endif diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h index b36553f61b9ba675efae1e6a668aa100e24a25b5..ae4294fbc73b5af99a090a2c2a2b10d1a8d7bb83 100644 --- a/av1/common/clpf_simd.h +++ b/av1/common/clpf_simd.h @@ -263,7 +263,7 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride, } } -#if CONFIG_AOM_HIGHBITDEPTH +#if defined(CONFIG_AOM_HIGHBITDEPTH) // sign(a - b) * max(0, abs(a - b) - max(0, abs(a - b) - // strength + (abs(a - b) >> (dmp - log2(s))))) SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength, diff --git a/av1/common/enums.h b/av1/common/enums.h index 1e868ae99d17141f6917adc2bd16f05ef3d0cddb..392c97af419880303c362268cf07cf08174477ac 100644 --- a/av1/common/enums.h +++ b/av1/common/enums.h @@ -295,16 +295,6 @@ typedef enum { } PALETTE_COLOR; #endif // CONFIG_PALETTE -#ifdef CONFIG_CDEF -#define CLPF_NOFLAG -1 -typedef enum { - CLPF_NOSIZE = 0, - CLPF_32X32 = 1, - CLPF_64X64 = 2, - CLPF_128X128 = 3 -} CLPF_BLOCK_SIZE; -#endif - typedef enum ATTRIBUTE_PACKED { DC_PRED, // Average of above and left pixels V_PRED, // Vertical diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c index 82fd09e7c4eb38ff1c3d5cf26e5cd20d0b6ee5ee..1371904c6991edeb74755e1efcaa47c1ecb191bf 100644 --- a/av1/common/od_dering.c +++ b/av1/common/od_dering.c @@ -8,14 +8,17 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ + +#include +#include + #ifdef HAVE_CONFIG_H -#include "config.h" +#include "./config.h" #endif -#include -#include -#include "dering.h" +#include "./aom_dsp_rtcd.h" #include "./av1_rtcd.h" +#include "./cdef.h" /* Generated from gen_filter_tables.c. */ const int OD_DIRECTION_OFFSETS_TABLE[8][3] = { @@ -38,7 +41,7 @@ const int OD_DIRECTION_OFFSETS_TABLE[8][3] = { in a particular direction. Since each direction have the same sum(x^2) term, that term is never computed. See Section 2, step 2, of: http://jmvalin.ca/notes/intra_paint.pdf */ -int od_dir_find8_c(const int16_t *img, int stride, int32_t *var, +int od_dir_find8_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift) { int i; int32_t cost[8] = { 0 }; @@ -110,8 +113,9 @@ int od_dir_find8_c(const int16_t *img, int stride, int32_t *var, } /* Smooth in the direction detected. */ -int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in, - int threshold, int dir) { +int od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, + const uint16_t *in, int threshold, + int dir) { int i; int j; int k; @@ -144,8 +148,9 @@ int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in, } /* Smooth in the direction detected. */ -int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in, - int threshold, int dir) { +int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, + const uint16_t *in, int threshold, + int dir) { int i; int j; int k; @@ -198,22 +203,22 @@ static INLINE int od_adjust_thresh(int threshold, int32_t var) { return (threshold * OD_THRESH_TABLE_Q8[OD_ILOG(v1)] + 128) >> 8; } -static INLINE void copy_8x8_16bit_to_16bit(int16_t *dst, int dstride, - int16_t *src, int sstride) { +static INLINE void copy_8x8_16bit_to_16bit(uint16_t *dst, int dstride, + uint16_t *src, int sstride) { int i, j; for (i = 0; i < 8; i++) for (j = 0; j < 8; j++) dst[i * dstride + j] = src[i * sstride + j]; } -static INLINE void copy_4x4_16bit_to_16bit(int16_t *dst, int dstride, - int16_t *src, int sstride) { +static INLINE void copy_4x4_16bit_to_16bit(uint16_t *dst, int dstride, + uint16_t *src, int sstride) { int i, j; for (i = 0; i < 4; i++) for (j = 0; j < 4; j++) dst[i * dstride + j] = src[i * sstride + j]; } /* TODO: Optimize this function for SSE. */ -void copy_dering_16bit_to_16bit(int16_t *dst, int dstride, int16_t *src, +void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src, dering_list *dlist, int dering_count, int bsize) { int bi, bx, by; @@ -234,10 +239,11 @@ void copy_dering_16bit_to_16bit(int16_t *dst, int dstride, int16_t *src, } } -void od_dering(int16_t *y, int16_t *in, int xdec, +void od_dering(uint16_t *y, uint16_t *in, int xdec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli, dering_list *dlist, int dering_count, int threshold, - int coeff_shift) { + int clpf_strength, int clpf_damping, int coeff_shift, + BOUNDARY_TYPE bt) { int bi; int bx; int by; @@ -276,6 +282,21 @@ void od_dering(int16_t *y, int16_t *in, int xdec, dir[by][bx]); } } + if (!clpf_strength) return; copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, y, dlist, dering_count, bsize); + for (bi = 0; bi < dering_count; bi++) { + BOUNDARY_TYPE bt2 = 0; + by = dlist[bi].by; + bx = dlist[bi].bx; + + // Prevent CLPF from reading across superblock boundaries + if (!by) bt2 |= TILE_ABOVE_BOUNDARY; + if (by == (1 << bsize) - 1) bt2 |= TILE_BOTTOM_BOUNDARY; + + aom_clpf_block_hbd(in, &y[((bi - by) << 2 * bsize) - (bx << bsize)], + OD_FILT_BSTRIDE, 1 << bsize, bx << bsize, by << bsize, + 1 << bsize, 1 << bsize, clpf_strength << coeff_shift, + bt | bt2, clpf_damping + coeff_shift); + } } diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h index e61e7fa195cf333c0b349245138070d0ccb2dcd5..e414fff54a65781781cbf74fb727adc913054d54 100644 --- a/av1/common/od_dering.h +++ b/av1/common/od_dering.h @@ -24,8 +24,9 @@ #define OD_FILT_VBORDER (3) /* We only need to buffer three horizontal lines too, but let's make it four to make vectorization easier. */ -#define OD_FILT_HBORDER (4) -#define OD_FILT_BSTRIDE (OD_BSIZE_MAX + 2 * OD_FILT_HBORDER) +#define OD_FILT_HBORDER (32) +#define OD_FILT_BSTRIDE \ + ALIGN_POWER_OF_TWO(OD_BSIZE_MAX + 2 * OD_FILT_HBORDER, 5) #define OD_DERING_VERY_LARGE (30000) #define OD_DERING_INBUF_SIZE \ @@ -38,19 +39,22 @@ typedef struct { unsigned char bx; } dering_list; -typedef int (*od_filter_dering_direction_func)(int16_t *y, int ystride, - const int16_t *in, int threshold, - int dir); -void copy_dering_16bit_to_16bit(int16_t *dst, int dstride, int16_t *src, +typedef int (*od_filter_dering_direction_func)(uint16_t *y, int ystride, + const uint16_t *in, + int threshold, int dir); +void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src, dering_list *dlist, int dering_count, int bsize); -void od_dering(int16_t *y, int16_t *in, int xdec, +void od_dering(uint16_t *y, uint16_t *in, int xdec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli, - dering_list *dlist, int skip_stride, int threshold, - int coeff_shift); -int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in, - int threshold, int dir); -int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in, - int threshold, int dir); + dering_list *dlist, int dering_count, int threshold, + int clpf_strength, int clpf_damping, int coeff_shift, + BOUNDARY_TYPE bt); +int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, + const uint16_t *in, int threshold, + int dir); +int od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, + const uint16_t *in, int threshold, + int dir); #endif diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h index f0d6dab841c3f17a4bddb49824a3b3d6fd249675..997e3f28506aa655101c49ebd66997fc0d1f54d6 100644 --- a/av1/common/onyxc_int.h +++ b/av1/common/onyxc_int.h @@ -35,7 +35,11 @@ #if CONFIG_PVQ #include "av1/common/pvq.h" #endif - +#if CONFIG_CDEF +struct AV1Common; +typedef struct AV1Common AV1_COMMON; +#include "av1/common/cdef.h" +#endif #ifdef __cplusplus extern "C" { #endif @@ -166,30 +170,6 @@ typedef struct AV1Common { // Marks if we need to use 16bit frame buffers (1: yes, 0: no). int use_highbitdepth; #endif -#if CONFIG_CDEF - // Two bits are used to signal the strength for all blocks and the - // valid values are: - // 0: no filtering - // 1: strength = 1 - // 2: strength = 2 - // 3: strength = 4 - int clpf_strength_y; - int clpf_strength_u; - int clpf_strength_v; - - // If clpf_strength_y is not 0, another two bits are used to signal - // the filter block size. The valid values for clfp_size are: - // 0: no block signalling - // 1: 32x32 - // 2: 64x64 - // 3: 128x128 - CLPF_BLOCK_SIZE clpf_size; - - // Buffer for storing whether to filter individual blocks. - int8_t *clpf_blocks; - int clpf_stride; -#endif - YV12_BUFFER_CONFIG *frame_to_show; RefCntBuffer *prev_frame; @@ -417,7 +397,13 @@ typedef struct AV1Common { int mib_size; // Size of the superblock in units of MI blocks int mib_size_log2; // Log 2 of above. #if CONFIG_CDEF - int dering_level; + uint32_t dering_level; + int dering_lev[DERING_REFINEMENT_LEVELS]; + int clpf_str[CLPF_REFINEMENT_LEVELS]; + int dering_bits; + int clpf_bits; + int clpf_strength_u; + int clpf_strength_v; #endif #if CONFIG_DELTA_Q diff --git a/av1/common/x86/od_dering_sse4.c b/av1/common/x86/od_dering_sse4.c index 455bb2c314f46322b0e3347564c06a4ae1b48291..58b601f7c310ba928f7b14d95e1ba6c114035531 100644 --- a/av1/common/x86/od_dering_sse4.c +++ b/av1/common/x86/od_dering_sse4.c @@ -228,8 +228,8 @@ static INLINE __m128i od_cmplt_abs_epi16(__m128i in, __m128i threshold) { return _mm_cmplt_epi16(_mm_abs_epi16(in), threshold); } -int od_filter_dering_direction_4x4_sse4_1(int16_t *y, int ystride, - const int16_t *in, int threshold, +int od_filter_dering_direction_4x4_sse4_1(uint16_t *y, int ystride, + const uint16_t *in, int threshold, int dir) { int i; __m128i sum; @@ -303,8 +303,8 @@ int od_filter_dering_direction_4x4_sse4_1(int16_t *y, int ystride, return (hsum_epi16(total_abs) + 2) >> 2; } -int od_filter_dering_direction_8x8_sse4_1(int16_t *y, int ystride, - const int16_t *in, int threshold, +int od_filter_dering_direction_8x8_sse4_1(uint16_t *y, int ystride, + const uint16_t *in, int threshold, int dir) { int i; __m128i sum; diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index 96afcb514a084f87773382be38c34615c996c4d5..0897f0c615b50407e3fbd4293b958767079a20cd 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -29,9 +29,8 @@ #include "av1/common/alloccommon.h" #if CONFIG_CDEF -#include "aom/aom_image.h" +#include "av1/common/cdef.h" #include "av1/common/clpf.h" -#include "av1/common/dering.h" #endif #if CONFIG_INSPECTION #include "av1/decoder/inspection.h" @@ -1967,22 +1966,6 @@ static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, } } #endif // CONFIG_SUPERTX -#if CONFIG_CDEF -static int clpf_all_skip(const AV1_COMMON *cm, int mi_col, int mi_row, - int size) { - int r, c; - int skip = 1; - const int maxc = AOMMIN(size, cm->mi_cols - mi_col); - const int maxr = AOMMIN(size, cm->mi_rows - mi_row); - for (r = 0; r < maxr && skip; r++) { - for (c = 0; c < maxc && skip; c++) { - skip &= !!cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c] - ->mbmi.skip; - } - } - return skip; -} -#endif // TODO(slavarnway): eliminate bsize and subsize in future commits static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd, @@ -2341,100 +2324,29 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd, #if CONFIG_CDEF #if CONFIG_EXT_PARTITION if (cm->sb_size == BLOCK_128X128 && bsize == BLOCK_128X128) { - if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) { + if (!sb_all_skip(cm, mi_row, mi_col)) { cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain = - aom_read_literal(r, DERING_REFINEMENT_BITS, ACCT_STR); + aom_read_literal(r, cm->dering_bits, ACCT_STR); + cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.clpf_strength = + aom_read_literal(r, cm->clpf_bits, ACCT_STR); } else { cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain = - 0; + cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col] + ->mbmi.clpf_strength = 0; } } else if (cm->sb_size == BLOCK_64X64 && bsize == BLOCK_64X64) { #else if (bsize == BLOCK_64X64) { #endif - if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) { + if (!sb_all_skip(cm, mi_row, mi_col)) { cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain = - aom_read_literal(r, DERING_REFINEMENT_BITS, ACCT_STR); + aom_read_literal(r, cm->dering_bits, ACCT_STR); + cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.clpf_strength = + aom_read_literal(r, cm->clpf_bits, ACCT_STR); } else { cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain = - 0; - } - } -#if CONFIG_EXT_PARTITION - if (cm->sb_size == BLOCK_128X128 && bsize == BLOCK_128X128 && - cm->clpf_strength_y && cm->clpf_size != CLPF_NOSIZE) { - const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride + - mi_col * MI_SIZE / MIN_FB_SIZE; - if (cm->clpf_size == CLPF_128X128) { - cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR); - } else if (cm->clpf_size == CLPF_64X64) { - const int tr = tl + 2; - const int bl = tl + 2 * cm->clpf_stride; - const int br = tr + 2 * cm->clpf_stride; - const int size = 64 / MI_SIZE; - - // Up to four bits per SB - if (!clpf_all_skip(cm, mi_col, mi_row, size)) - cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR); - - if (mi_col + size < cm->mi_cols && - !clpf_all_skip(cm, mi_col + size, mi_row, size)) - cm->clpf_blocks[tr] = aom_read_literal(r, 1, ACCT_STR); - - if (mi_row + size < cm->mi_rows && - !clpf_all_skip(cm, mi_col, mi_row + size, size)) - cm->clpf_blocks[bl] = aom_read_literal(r, 1, ACCT_STR); - - if (mi_col + size < cm->mi_cols && mi_row + size < cm->mi_rows && - !clpf_all_skip(cm, mi_col + size, mi_row + size, size)) - cm->clpf_blocks[br] = aom_read_literal(r, 1, ACCT_STR); - } else if (cm->clpf_size == CLPF_32X32) { - int i, j; - const int size = 32 / MI_SIZE; - for (i = 0; i < 4; ++i) - for (j = 0; j < 4; ++j) { - const int index = tl + i * cm->clpf_stride + j; - if (mi_row + i * size < cm->mi_rows && - mi_col + j * size < cm->mi_cols && - !clpf_all_skip(cm, mi_col + j * size, mi_row + i * size, size)) - cm->clpf_blocks[index] = aom_read_literal(r, 1, ACCT_STR); - } - } - } else if (cm->sb_size == BLOCK_64X64 && bsize == BLOCK_64X64 && -#else - if (bsize == BLOCK_64X64 && -#endif // CONFIG_EXT_PARTITION - cm->clpf_strength_y && cm->clpf_size != CLPF_NOSIZE) { - const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride + - mi_col * MI_SIZE / MIN_FB_SIZE; - - if (!((mi_row * MI_SIZE) & 127) && !((mi_col * MI_SIZE) & 127) && - cm->clpf_size == CLPF_128X128) { - cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR); - } else if (cm->clpf_size == CLPF_64X64 && - !clpf_all_skip(cm, mi_col, mi_row, 64 / MI_SIZE)) { - cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR); - } else if (cm->clpf_size == CLPF_32X32) { - const int tr = tl + 1; - const int bl = tl + cm->clpf_stride; - const int br = tr + cm->clpf_stride; - const int size = 32 / MI_SIZE; - - // Up to four bits per SB - if (!clpf_all_skip(cm, mi_col, mi_row, size)) - cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR); - - if (mi_col + size < cm->mi_cols && - !clpf_all_skip(cm, mi_col + size, mi_row, size)) - cm->clpf_blocks[tr] = aom_read_literal(r, 1, ACCT_STR); - - if (mi_row + size < cm->mi_rows && - !clpf_all_skip(cm, mi_col, mi_row + size, size)) - cm->clpf_blocks[bl] = aom_read_literal(r, 1, ACCT_STR); - - if (mi_col + size < cm->mi_cols && mi_row + size < cm->mi_rows && - !clpf_all_skip(cm, mi_col + size, mi_row + size, size)) - cm->clpf_blocks[br] = aom_read_literal(r, 1, ACCT_STR); + cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col] + ->mbmi.clpf_strength = 0; } } #endif // CONFIG_CDEF @@ -2697,42 +2609,12 @@ static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { } #if CONFIG_CDEF -static void setup_clpf(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { - AV1_COMMON *const cm = &pbi->common; - const int width = pbi->cur_buf->buf.y_crop_width; - const int height = pbi->cur_buf->buf.y_crop_height; - - cm->clpf_blocks = 0; - cm->clpf_strength_y = aom_rb_read_literal(rb, 2); +static void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { + cm->dering_level = aom_rb_read_literal(rb, DERING_LEVEL_BITS); cm->clpf_strength_u = aom_rb_read_literal(rb, 2); cm->clpf_strength_v = aom_rb_read_literal(rb, 2); - if (cm->clpf_strength_y) { - cm->clpf_size = aom_rb_read_literal(rb, 2); - if (cm->clpf_size != CLPF_NOSIZE) { - int size; - cm->clpf_stride = - ((width + MIN_FB_SIZE - 1) & ~(MIN_FB_SIZE - 1)) >> MIN_FB_SIZE_LOG2; - size = - cm->clpf_stride * ((height + MIN_FB_SIZE - 1) & ~(MIN_FB_SIZE - 1)) >> - MIN_FB_SIZE_LOG2; - CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(size)); - memset(cm->clpf_blocks, -1, size); - } - } -} - -static int clpf_bit(UNUSED int k, UNUSED int l, - UNUSED const YV12_BUFFER_CONFIG *rec, - UNUSED const YV12_BUFFER_CONFIG *org, - UNUSED const AV1_COMMON *cm, UNUSED int block_size, - UNUSED int w, UNUSED int h, UNUSED unsigned int strength, - UNUSED unsigned int fb_size_log2, int8_t *bit, - UNUSED int plane) { - return *bit; -} - -static void setup_dering(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { - cm->dering_level = aom_rb_read_literal(rb, DERING_LEVEL_BITS); + id_to_levels(cm->dering_lev, cm->clpf_str, cm->dering_level); + cdef_get_bits(cm->dering_lev, cm->clpf_str, &cm->dering_bits, &cm->clpf_bits); } #endif // CONFIG_CDEF @@ -4359,8 +4241,7 @@ static size_t read_uncompressed_header(AV1Decoder *pbi, setup_loopfilter(cm, rb); #if CONFIG_CDEF - setup_dering(cm, rb); - setup_clpf(pbi, rb); + setup_cdef(cm, rb); #endif #if CONFIG_LOOP_RESTORATION decode_restoration_mode(cm, rb); @@ -5002,28 +4883,11 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data, } #if CONFIG_CDEF - if (cm->dering_level && !cm->skip_loop_filter) { - av1_dering_frame(&pbi->cur_buf->buf, cm, &pbi->mb, cm->dering_level); - } - if (!cm->skip_loop_filter) { - const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf; - if (cm->clpf_strength_y) { - av1_clpf_frame(frame, NULL, cm, cm->clpf_size != CLPF_NOSIZE, - cm->clpf_strength_y + (cm->clpf_strength_y == 3), - 4 + cm->clpf_size, AOM_PLANE_Y, clpf_bit); - } - if (cm->clpf_strength_u) { - av1_clpf_frame(frame, NULL, cm, 0, // No block signals for chroma - cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4, - AOM_PLANE_U, NULL); - } - if (cm->clpf_strength_v) { - av1_clpf_frame(frame, NULL, cm, 0, // No block signals for chroma - cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4, - AOM_PLANE_V, NULL); - } + if ((cm->dering_level || cm->clpf_strength_u || cm->clpf_strength_v) && + !cm->skip_loop_filter) { + av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb, cm->dering_level, + cm->clpf_strength_u, cm->clpf_strength_v); } - if (cm->clpf_blocks) aom_free(cm->clpf_blocks); #endif // CONFIG_CDEF #if CONFIG_LOOP_RESTORATION diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c index 7ff1e165b267a3f03463c8ef285caa27e7df9e33..587134f38b383354fe438297904182f6ac4ce4fd 100644 --- a/av1/encoder/bitstream.c +++ b/av1/encoder/bitstream.c @@ -24,8 +24,8 @@ #endif // CONFIG_BITSTREAM_DEBUG #if CONFIG_CDEF +#include "av1/common/cdef.h" #include "av1/common/clpf.h" -#include "av1/common/dering.h" #endif // CONFIG_CDEF #include "av1/common/entropy.h" #include "av1/common/entropymode.h" @@ -2735,95 +2735,29 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile, #if CONFIG_CDEF #if CONFIG_EXT_PARTITION if (cm->sb_size == BLOCK_128X128 && bsize == BLOCK_128X128 && - cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) { + !sb_all_skip(cm, mi_row, mi_col)) { aom_write_literal( w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain, - DERING_REFINEMENT_BITS); + cm->dering_bits); + aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col] + ->mbmi.clpf_strength, + cm->clpf_bits); } else if (cm->sb_size == BLOCK_64X64 && bsize == BLOCK_64X64 && #else if (bsize == BLOCK_64X64 && #endif // CONFIG_EXT_PARTITION - cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) { - aom_write_literal( - w, - cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain, - DERING_REFINEMENT_BITS); + !sb_all_skip(cm, mi_row, mi_col)) { + if (cm->dering_bits) + aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col] + ->mbmi.dering_gain, + cm->dering_bits); + if (cm->clpf_bits) + aom_write_literal(w, cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col] + ->mbmi.clpf_strength, + cm->clpf_bits); } #endif - -#if CONFIG_CDEF -#if CONFIG_EXT_PARTITION - if (cm->sb_size == BLOCK_128X128 && bsize == BLOCK_128X128 && - cm->clpf_blocks && cm->clpf_strength_y && cm->clpf_size != CLPF_NOSIZE) { - const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride + - mi_col * MI_SIZE / MIN_FB_SIZE; - if (cm->clpf_size == CLPF_128X128 && cm->clpf_blocks[tl] != CLPF_NOFLAG) { - aom_write_literal(w, cm->clpf_blocks[tl], 1); - } else if (cm->clpf_size == CLPF_64X64) { - const int tr = tl + 2; - const int bl = tl + 2 * cm->clpf_stride; - const int br = tr + 2 * cm->clpf_stride; - - // Up to four bits per SB. - if (cm->clpf_blocks[tl] != CLPF_NOFLAG) - aom_write_literal(w, cm->clpf_blocks[tl], 1); - - if (mi_col + MI_SIZE < cm->mi_cols && cm->clpf_blocks[tr] != CLPF_NOFLAG) - aom_write_literal(w, cm->clpf_blocks[tr], 1); - - if (mi_row + MI_SIZE < cm->mi_rows && cm->clpf_blocks[bl] != CLPF_NOFLAG) - aom_write_literal(w, cm->clpf_blocks[bl], 1); - - if (mi_row + MI_SIZE < cm->mi_rows && mi_col + MI_SIZE < cm->mi_cols && - cm->clpf_blocks[br] != CLPF_NOFLAG) - aom_write_literal(w, cm->clpf_blocks[br], 1); - } else if (cm->clpf_size == CLPF_32X32) { - int i, j; - const int size = 32 / MI_SIZE; - // Up to sixteen bits per SB. - for (i = 0; i < 4; ++i) - for (j = 0; j < 4; ++j) { - const int index = tl + i * cm->clpf_stride + j; - if (mi_row + i * size < cm->mi_rows && - mi_col + j * size < cm->mi_cols && - cm->clpf_blocks[index] != CLPF_NOFLAG) - aom_write_literal(w, cm->clpf_blocks[index], 1); - } - } - } else if (cm->sb_size == BLOCK_64X64 && bsize == BLOCK_64X64 && -#else - if (bsize == BLOCK_64X64 && -#endif // CONFIG_EXT_PARTITION - cm->clpf_blocks && cm->clpf_strength_y && - cm->clpf_size != CLPF_NOSIZE) { - const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride + - mi_col * MI_SIZE / MIN_FB_SIZE; - const int tr = tl + 1; - const int bl = tl + cm->clpf_stride; - const int br = tr + cm->clpf_stride; - - // Up to four bits per SB. - // When clpf_size indicates a size larger than the SB size - // (CLPF_128X128), one bit for every fourth SB will be transmitted - // regardless of skip blocks. - if (cm->clpf_blocks[tl] != CLPF_NOFLAG) - aom_write_literal(w, cm->clpf_blocks[tl], 1); - - if (mi_col + MI_SIZE / 2 < cm->mi_cols && - cm->clpf_blocks[tr] != CLPF_NOFLAG) - aom_write_literal(w, cm->clpf_blocks[tr], 1); - - if (mi_row + MI_SIZE / 2 < cm->mi_rows && - cm->clpf_blocks[bl] != CLPF_NOFLAG) - aom_write_literal(w, cm->clpf_blocks[bl], 1); - - if (mi_row + MI_SIZE / 2 < cm->mi_rows && - mi_col + MI_SIZE / 2 < cm->mi_cols && - cm->clpf_blocks[br] != CLPF_NOFLAG) - aom_write_literal(w, cm->clpf_blocks[br], 1); - } -#endif // CONFIG_CDEF } static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile, @@ -3522,22 +3456,13 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { } #if CONFIG_CDEF -static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { - aom_wb_write_literal(wb, cm->clpf_strength_y, 2); +static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { + aom_wb_write_literal(wb, cm->dering_level, DERING_LEVEL_BITS); aom_wb_write_literal(wb, cm->clpf_strength_u, 2); aom_wb_write_literal(wb, cm->clpf_strength_v, 2); - if (cm->clpf_strength_y) { - aom_wb_write_literal(wb, cm->clpf_size, 2); - } } #endif -#if CONFIG_CDEF -static void encode_dering(int level, struct aom_write_bit_buffer *wb) { - aom_wb_write_literal(wb, level, DERING_LEVEL_BITS); -} -#endif // CONFIG_CDEF - static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) { if (delta_q != 0) { aom_wb_write_bit(wb, 1); @@ -4481,8 +4406,7 @@ static void write_uncompressed_header(AV1_COMP *cpi, encode_loopfilter(cm, wb); #if CONFIG_CDEF - encode_dering(cm->dering_level, wb); - encode_clpf(cm, wb); + encode_cdef(cm, wb); #endif #if CONFIG_LOOP_RESTORATION encode_restoration_mode(cm, wb); diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c index 3ef67cc85a662b32ae9648a63e9568a333d0e425..01736818bfcef854f0b499708c1a587ef20f75e1 100644 --- a/av1/encoder/clpf_rdo.c +++ b/av1/encoder/clpf_rdo.c @@ -142,68 +142,17 @@ void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org, } #endif -int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, - const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, - int block_size, int w, int h, unsigned int strength, - unsigned int fb_size_log2, int8_t *res, int plane) { - int m, n, sum0 = 0, sum1 = 0; - int damping = - cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6); - - for (m = 0; m < h; m++) { - for (n = 0; n < w; n++) { - int xpos = (l << fb_size_log2) + n * block_size; - int ypos = (k << fb_size_log2) + m * block_size; - if (fb_size_log2 == MAX_FB_SIZE_LOG2 || - !cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE] - ->mbmi.skip) { -#if CONFIG_AOM_HIGHBITDEPTH - if (cm->use_highbitdepth) { - aom_clpf_detect_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer), - CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride, - org->y_stride, xpos, ypos, rec->y_crop_width, - rec->y_crop_height, &sum0, &sum1, strength, - block_size, cm->bit_depth, damping); - } else { - aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride, - org->y_stride, xpos, ypos, rec->y_crop_width, - rec->y_crop_height, &sum0, &sum1, strength, - block_size, damping); - } -#else - aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride, - org->y_stride, xpos, ypos, rec->y_crop_width, - rec->y_crop_height, &sum0, &sum1, strength, block_size, - damping); -#endif - } - } - } - *res = sum1 < sum0; - return *res; -} - // Calculate the square error of all filter settings. Result: // res[0][0] : unfiltered // res[0][1-3] : strength=1,2,4, no signals -// (Only for luma:) -// res[1][0] : (bit count, fb size = 128) -// res[1][1-3] : strength=1,2,4, fb size = 128 -// res[1][4] : unfiltered, including skip -// res[1][5-7] : strength=1,2,4, including skip, fb_size = 128 -// res[2][0] : (bit count, fb size = 64) -// res[2][1-3] : strength=1,2,4, fb size = 64 -// res[3][0] : (bit count, fb size = 32) -// res[3][1-3] : strength=1,2,4, fb size = 32 -static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec, - const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, - unsigned int block_size, unsigned int fb_size_log2, int w, - int h, int64_t res[4][8], int plane) { - int c, m, n, filtered = 0; - int sum[8]; +static void clpf_rdo(const YV12_BUFFER_CONFIG *rec, + const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + unsigned int block_size, int w, int h, uint64_t res[4], + int plane) { + int m, n; + int sum[4]; const int subx = plane != AOM_PLANE_Y && rec->subsampling_x; const int suby = plane != AOM_PLANE_Y && rec->subsampling_y; - int bslog = get_msb(block_size); uint8_t *rec_buffer = plane != AOM_PLANE_Y ? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer) @@ -220,166 +169,64 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec, int damping = cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6); - sum[0] = sum[1] = sum[2] = sum[3] = sum[4] = sum[5] = sum[6] = sum[7] = 0; - if (plane == AOM_PLANE_Y && - fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) { - int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered; - - filtered = fb_size_log2-- == MAX_FB_SIZE_LOG2; - w1 = AOMMIN(1 << (fb_size_log2 - bslog), w); - h1 = AOMMIN(1 << (fb_size_log2 - bslog), h); - w2 = AOMMIN(w - (1 << (fb_size_log2 - bslog)), w >> 1); - h2 = AOMMIN(h - (1 << (fb_size_log2 - bslog)), h >> 1); - i = get_msb(MAX_FB_SIZE) - fb_size_log2; - sum1 = (int)res[i][1]; - sum2 = (int)res[i][2]; - sum3 = (int)res[i][3]; - oldfiltered = (int)res[i][0]; - res[i][0] = 0; - - filtered |= clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1, - res, plane); - if (1 << (fb_size_log2 - bslog) < w) - filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size, - fb_size_log2, w2, h1, res, plane); - if (1 << (fb_size_log2 - bslog) < h) { - filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size, - fb_size_log2, w1, h2, res, plane); - filtered |= - clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), rec, org, - cm, block_size, fb_size_log2, w2, h2, res, plane); - } - - // Correct sums for unfiltered blocks - res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]); - res[i][2] = AOMMIN(sum2 + res[i][0], res[i][2]); - res[i][3] = AOMMIN(sum3 + res[i][0], res[i][3]); - if (i == 1) { - res[i][5] = AOMMIN(sum1 + res[i][4], res[i][5]); - res[i][6] = AOMMIN(sum2 + res[i][4], res[i][6]); - res[i][7] = AOMMIN(sum3 + res[i][4], res[i][7]); - } - - res[i][0] = oldfiltered + filtered; // Number of signal bits - - return filtered; - } + sum[0] = sum[1] = sum[2] = sum[3] = 0; for (m = 0; m < h; m++) { for (n = 0; n < w; n++) { - int xpos = x + n * block_size; - int ypos = y + m * block_size; - int skip = // Filtered skip blocks stored only for fb_size == 128 - 4 * - !!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride + - (xpos << subx) / MI_SIZE] - ->mbmi.skip; + int xpos = n * block_size; + int ypos = m * block_size; + if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride + + (xpos << subx) / MI_SIZE] + ->mbmi.skip) { #if CONFIG_AOM_HIGHBITDEPTH - if (cm->use_highbitdepth) { - aom_clpf_detect_multi_hbd( - CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer), - rec_stride, org_stride, xpos, ypos, rec_width, rec_height, - sum + skip, block_size, cm->bit_depth, damping); - } else { + if (cm->use_highbitdepth) { + aom_clpf_detect_multi_hbd( + CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer), + rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum, + block_size, cm->bit_depth, damping); + } else { + aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride, + xpos, ypos, rec_width, rec_height, sum, + block_size, damping); + } +#else aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride, - xpos, ypos, rec_width, rec_height, sum + skip, + xpos, ypos, rec_width, rec_height, sum, block_size, damping); - } -#else - aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride, - xpos, ypos, rec_width, rec_height, sum + skip, - block_size, damping); #endif - filtered |= !skip; + } } } - for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++) { - res[c][0] += sum[0]; - res[c][1] += sum[1]; - res[c][2] += sum[2]; - res[c][3] += sum[3]; - if (c != 1) continue; - // Only needed when fb_size == 128 - res[c][4] += sum[4]; - res[c][5] += sum[5]; - res[c][6] += sum[6]; - res[c][7] += sum[7]; - } - return filtered; + res[0] += sum[0]; + res[1] += sum[1]; + res[2] += sum[2]; + res[3] += sum[3]; } -void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec, +void av1_clpf_test_plane(const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, - int *best_strength, int *best_bs, int plane) { - int c, j, k, l; - int64_t best, sums[4][8]; + int *best_strength, int plane) { + int i; + uint64_t best, sums[4]; int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width; int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height; const int bs = MI_SIZE; const int bslog = get_msb(bs); - int fb_size_log2 = get_msb(MAX_FB_SIZE); - int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2; - int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2; memset(sums, 0, sizeof(sums)); - if (plane != AOM_PLANE_Y) - // Use a block size of MI_SIZE regardless of the subsampling. This - // This is accurate enough to determine the best strength and - // we don't need to add SIMD optimisations for 4x4 blocks. - clpf_rdo(0, 0, rec, org, cm, bs, fb_size_log2, width >> bslog, - height >> bslog, sums, plane); - else - for (k = 0; k < num_fb_ver; k++) { - for (l = 0; l < num_fb_hor; l++) { - // Calculate the block size after frame border clipping - int h = - AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); - int w = - AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); - h += !h << fb_size_log2; - w += !w << fb_size_log2; - clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, MI_SIZE, - fb_size_log2, w >> bslog, h >> bslog, sums, plane); - } - } - - // For fb_size == 128 skip blocks are included in the result. - if (plane == AOM_PLANE_Y) { - sums[1][1] += sums[1][5] - sums[1][4]; - sums[1][2] += sums[1][6] - sums[1][4]; - sums[1][3] += sums[1][7] - sums[1][4]; - } else { // Slightly favour unfiltered chroma - sums[0][0] -= sums[0][0] >> 7; - } + clpf_rdo(rec, org, cm, bs, width >> bslog, height >> bslog, sums, plane); - for (j = 0; j < 4; j++) { - static const double lambda_square[] = { - // exp(x / 8.5) - 1.0000, 1.1248, 1.2653, 1.4232, 1.6009, 1.8008, 2.0256, 2.2785, - 2.5630, 2.8830, 3.2429, 3.6478, 4.1032, 4.6155, 5.1917, 5.8399, - 6.5689, 7.3891, 8.3116, 9.3492, 10.516, 11.829, 13.306, 14.967, - 16.836, 18.938, 21.302, 23.962, 26.953, 30.318, 34.103, 38.361, - 43.151, 48.538, 54.598, 61.414, 69.082, 77.706, 87.408, 98.320, - 110.59, 124.40, 139.93, 157.40, 177.05, 199.16, 224.02, 251.99, - 283.45, 318.84, 358.65, 403.42, 453.79, 510.45, 574.17, 645.86, - 726.49, 817.19, 919.22, 1033.9, 1163.0, 1308.2, 1471.6, 1655.3 - }; + // Add a favourable bias for conservative strengths + for (i = 0; i < 4; i++) sums[i] -= sums[i] >> (7 + i); - // Estimate the bit costs and adjust the square errors - double lambda = - lambda_square[av1_get_qindex(&cm->seg, 0, cm->base_qindex) >> 2]; - int i, cost = (int)((lambda * (sums[j][0] + 6 + 2 * (j > 0)) + 0.5)); - for (i = 0; i < 4; i++) - sums[j][i] = ((sums[j][i] + (i && j) * cost) << 4) + j * 4 + i; - } + // Tag the strength to the error + for (i = 0; i < 4; i++) sums[i] = (sums[i] << 2) + i; - best = (int64_t)1 << 62; - for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++) - for (j = 0; j < 4; j++) - if ((!c || j) && sums[c][j] < best) best = sums[c][j]; - best &= 15; - if (best_bs) *best_bs = (best > 3) * (5 + (best < 12) + (best < 8)); - *best_strength = best ? 1 << ((best - 1) & 3) : 0; + // Identify the strength with the smallest error + best = (uint64_t)1 << 63; + for (i = 0; i < 4; i++) + if (sums[i] < best) best = sums[i]; + *best_strength = best & 3 ? 1 << ((best - 1) & 3) : 0; } diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h index f92f7d2c03cae706c1a5c65b29abbc31ecd92696..e137378165386da1a812373a0b5d4d11b2f07959 100644 --- a/av1/encoder/clpf_rdo.h +++ b/av1/encoder/clpf_rdo.h @@ -14,13 +14,8 @@ #include "av1/common/reconinter.h" -int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, - const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, - int block_size, int w, int h, unsigned int strength, - unsigned int fb_size_log2, int8_t *res, int plane); - -void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec, +void av1_clpf_test_plane(const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, - int *best_strength, int *best_bs, int plane); + int *best_strength, int plane); #endif diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index aca6e0bf90ad2997a68fc4a45ebd8416c5bebc87..7bc9710bd83d38d0a39c451c41498b55719bb3ee 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -17,10 +17,9 @@ #include "av1/common/alloccommon.h" #if CONFIG_CDEF -#include "aom/aom_image.h" +#include "av1/common/cdef.h" #include "av1/common/clpf.h" #include "av1/encoder/clpf_rdo.h" -#include "av1/common/dering.h" #endif // CONFIG_CDEF #include "av1/common/filter.h" #include "av1/common/idct.h" @@ -3526,57 +3525,18 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { } #if CONFIG_CDEF if (is_lossless_requested(&cpi->oxcf)) { - cm->dering_level = 0; + cm->dering_level = cm->clpf_strength_u = cm->clpf_strength_v = 0; } else { - cm->dering_level = - av1_dering_search(cm->frame_to_show, cpi->Source, cm, xd); - av1_dering_frame(cm->frame_to_show, cm, xd, cm->dering_level); - } - cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0; - cm->clpf_size = CLPF_64X64; + // Find cm->dering_level, cm->clpf_strength_u and cm->clpf_strength_v + av1_cdef_search(cm->frame_to_show, cpi->Source, cm, xd); - // Allocate buffer to hold the status of all filter blocks: - // 1 = On, 0 = off, -1 = implicitly off - { - int size; - cm->clpf_stride = ((cm->frame_to_show->y_crop_width + MIN_FB_SIZE - 1) & - ~(MIN_FB_SIZE - 1)) >> - MIN_FB_SIZE_LOG2; - size = cm->clpf_stride * - ((cm->frame_to_show->y_crop_height + MIN_FB_SIZE - 1) & - ~(MIN_FB_SIZE - 1)) >> - MIN_FB_SIZE_LOG2; - CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(size)); - memset(cm->clpf_blocks, CLPF_NOFLAG, size); - } - - if (!is_lossless_requested(&cpi->oxcf)) { - const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show; - - // Find the best strength and block size for the entire frame - int fb_size_log2, strength_y, strength_u, strength_v; - av1_clpf_test_frame(frame, cpi->Source, cm, &strength_y, &fb_size_log2, - AOM_PLANE_Y); - av1_clpf_test_frame(frame, cpi->Source, cm, &strength_u, 0, AOM_PLANE_U); - av1_clpf_test_frame(frame, cpi->Source, cm, &strength_v, 0, AOM_PLANE_V); - - if (strength_y) { - // Apply the filter using the chosen strength - cm->clpf_strength_y = strength_y - (strength_y == 4); - cm->clpf_size = - fb_size_log2 ? fb_size_log2 - MAX_FB_SIZE_LOG2 + 3 : CLPF_NOSIZE; - av1_clpf_frame(frame, cpi->Source, cm, cm->clpf_size != CLPF_NOSIZE, - strength_y, 4 + cm->clpf_size, AOM_PLANE_Y, - av1_clpf_decision); - } - if (strength_u) { - cm->clpf_strength_u = strength_u - (strength_u == 4); - av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, AOM_PLANE_U, NULL); - } - if (strength_v) { - cm->clpf_strength_v = strength_v - (strength_v == 4); - av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, AOM_PLANE_V, NULL); - } + // Apply the filter + av1_cdef_frame(cm->frame_to_show, cm, xd, cm->dering_level, + cm->clpf_strength_u, cm->clpf_strength_v); + + // Pack the clpf chroma strengths into two bits each + cm->clpf_strength_u -= cm->clpf_strength_u == 4; + cm->clpf_strength_v -= cm->clpf_strength_v == 4; } #endif #if CONFIG_LOOP_RESTORATION @@ -4980,11 +4940,6 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, if (cm->show_frame) dump_filtered_recon_frames(cpi); #endif // DUMP_RECON_FRAMES -#if CONFIG_CDEF - aom_free(cm->clpf_blocks); - cm->clpf_blocks = 0; -#endif - if (cm->seg.update_map) update_reference_segmentation_map(cpi); if (frame_is_intra_only(cm) == 0) { diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c new file mode 100644 index 0000000000000000000000000000000000000000..4ff308eff75ebe99045d593cbf635e1194a4c7a2 --- /dev/null +++ b/av1/encoder/pickcdef.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_scale_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/common/cdef.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/reconinter.h" +#include "av1/encoder/clpf_rdo.h" +#include "av1/encoder/encoder.h" + +static double compute_dist(uint16_t *x, int xstride, uint16_t *y, int ystride, + int nhb, int nvb, int coeff_shift) { + int i, j; + double sum; + sum = 0; + for (i = 0; i < nvb << 3; i++) { + for (j = 0; j < nhb << 3; j++) { + double tmp; + tmp = x[i * xstride + j] - y[i * ystride + j]; + sum += tmp * tmp; + } + } + return sum / (double)(1 << 2 * coeff_shift); +} + +void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, + AV1_COMMON *cm, MACROBLOCKD *xd) { + int r, c; + int sbr, sbc; + uint16_t *src; + uint16_t *ref_coeff; + dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE]; + int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } }; + int stride; + int bsize[3]; + int dec[3]; + int pli; + int level; + int dering_count; + int coeff_shift = AOMMAX(cm->bit_depth - 8, 0); + uint64_t best_tot_mse = 0; + int sb_count; + int nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; + int nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; + int *sb_index = aom_malloc(nvsb * nhsb * sizeof(*sb_index)); + uint64_t(*mse)[DERING_STRENGTHS][CLPF_STRENGTHS] = + aom_malloc(sizeof(*mse) * nvsb * nhsb); + int clpf_damping = 3 + (cm->base_qindex >> 6); + int i; + int lev[DERING_REFINEMENT_LEVELS]; + int best_lev[DERING_REFINEMENT_LEVELS]; + int str[CLPF_REFINEMENT_LEVELS]; + int best_str[CLPF_REFINEMENT_LEVELS]; + double lambda = exp(cm->base_qindex / 36.0); + static int log2[] = { 0, 1, 2, 2 }; + + src = aom_memalign(32, sizeof(*src) * cm->mi_rows * cm->mi_cols * 64); + ref_coeff = + aom_memalign(32, sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * 64); + av1_setup_dst_planes(xd->plane, frame, 0, 0); + for (pli = 0; pli < 3; pli++) { + dec[pli] = xd->plane[pli].subsampling_x; + bsize[pli] = OD_DERING_SIZE_LOG2 - dec[pli]; + } + stride = cm->mi_cols << bsize[0]; + for (r = 0; r < cm->mi_rows << bsize[0]; ++r) { + for (c = 0; c < cm->mi_cols << bsize[0]; ++c) { +#if CONFIG_AOM_HIGHBITDEPTH + if (cm->use_highbitdepth) { + src[r * stride + c] = CONVERT_TO_SHORTPTR( + xd->plane[0].dst.buf)[r * xd->plane[0].dst.stride + c]; + ref_coeff[r * stride + c] = + CONVERT_TO_SHORTPTR(ref->y_buffer)[r * ref->y_stride + c]; + } else { +#endif + src[r * stride + c] = + xd->plane[0].dst.buf[r * xd->plane[0].dst.stride + c]; + ref_coeff[r * stride + c] = ref->y_buffer[r * ref->y_stride + c]; +#if CONFIG_AOM_HIGHBITDEPTH + } +#endif + } + } + sb_count = 0; + for (sbr = 0; sbr < nvsb; sbr++) { + for (sbc = 0; sbc < nhsb; sbc++) { + int nvb, nhb; + int gi; + DECLARE_ALIGNED(32, uint16_t, dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]); + DECLARE_ALIGNED(32, uint16_t, + tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]); + nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc); + nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr); + dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE, + sbc * MAX_MIB_SIZE, dlist); + if (dering_count == 0) continue; + for (gi = 0; gi < DERING_STRENGTHS; gi++) { + int threshold; + DECLARE_ALIGNED(32, uint16_t, inbuf[OD_DERING_INBUF_SIZE]); + uint16_t *in; + int j; + level = dering_level_table[gi]; + threshold = level << coeff_shift; + for (r = 0; r < nvb << bsize[0]; r++) { + for (c = 0; c < nhb << bsize[0]; c++) { + dst[(r * MAX_MIB_SIZE << bsize[0]) + c] = + src[((sbr * MAX_MIB_SIZE << bsize[0]) + r) * stride + + (sbc * MAX_MIB_SIZE << bsize[0]) + c]; + } + } + in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER; + /* We avoid filtering the pixels for which some of the pixels to average + are outside the frame. We could change the filter instead, but it + would + add special cases for any future vectorization. */ + for (i = 0; i < OD_DERING_INBUF_SIZE; i++) + inbuf[i] = OD_DERING_VERY_LARGE; + for (i = -OD_FILT_VBORDER * (sbr != 0); + i < (nvb << bsize[0]) + OD_FILT_VBORDER * (sbr != nvsb - 1); i++) { + for (j = -OD_FILT_HBORDER * (sbc != 0); + j < (nhb << bsize[0]) + OD_FILT_HBORDER * (sbc != nhsb - 1); + j++) { + uint16_t *x; + x = &src[(sbr * stride * MAX_MIB_SIZE << bsize[0]) + + (sbc * MAX_MIB_SIZE << bsize[0])]; + in[i * OD_FILT_BSTRIDE + j] = x[i * stride + j]; + } + } + for (i = 0; i < CLPF_STRENGTHS; i++) { + od_dering(tmp_dst, in, 0, dir, 0, dlist, dering_count, threshold, + i + (i == 3), clpf_damping, coeff_shift, 0); + copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[0], tmp_dst, + dlist, dering_count, bsize[0]); + mse[sb_count][gi][i] = (int)compute_dist( + dst, MAX_MIB_SIZE << bsize[0], + &ref_coeff[(sbr * stride * MAX_MIB_SIZE << bsize[0]) + + (sbc * MAX_MIB_SIZE << bsize[0])], + stride, nhb, nvb, coeff_shift); + } + sb_index[sb_count] = + MAX_MIB_SIZE * sbr * cm->mi_stride + MAX_MIB_SIZE * sbc; + } + sb_count++; + } + } + best_tot_mse = (uint64_t)1 << 63; + + int l0; + for (l0 = 0; l0 < DERING_STRENGTHS; l0++) { + int l1; + lev[0] = l0; + for (l1 = l0; l1 < DERING_STRENGTHS; l1++) { + int l2; + lev[1] = l1; + for (l2 = l1; l2 < DERING_STRENGTHS; l2++) { + int l3; + lev[2] = l2; + for (l3 = l2; l3 < DERING_STRENGTHS; l3++) { + int cs0; + lev[3] = l3; + for (cs0 = 0; cs0 < CLPF_STRENGTHS; cs0++) { + int cs1; + str[0] = cs0; + for (cs1 = cs0; cs1 < CLPF_STRENGTHS; cs1++) { + uint64_t tot_mse = 0; + str[1] = cs1; + for (i = 0; i < sb_count; i++) { + int gi; + int cs; + uint64_t best_mse = (uint64_t)1 << 63; + for (gi = 0; gi < DERING_REFINEMENT_LEVELS; gi++) { + for (cs = 0; cs < CLPF_REFINEMENT_LEVELS; cs++) { + if (mse[i][lev[gi]][str[cs]] < best_mse) { + best_mse = mse[i][lev[gi]][str[cs]]; + } + } + } + tot_mse += best_mse; + } + + // Add the bit cost + int dering_diffs = 0, clpf_diffs = 0; + for (i = 1; i < DERING_REFINEMENT_LEVELS; i++) + dering_diffs += lev[i] != lev[i - 1]; + for (i = 1; i < CLPF_REFINEMENT_LEVELS; i++) + clpf_diffs += str[i] != str[i - 1]; + tot_mse += (uint64_t)(sb_count * lambda * + (log2[dering_diffs] + log2[clpf_diffs])); + + if (tot_mse < best_tot_mse) { + for (i = 0; i < DERING_REFINEMENT_LEVELS; i++) + best_lev[i] = lev[i]; + for (i = 0; i < CLPF_REFINEMENT_LEVELS; i++) + best_str[i] = str[i]; + best_tot_mse = tot_mse; + } + } + } + } + } + } + } + for (i = 0; i < DERING_REFINEMENT_LEVELS; i++) lev[i] = best_lev[i]; + for (i = 0; i < CLPF_REFINEMENT_LEVELS; i++) str[i] = best_str[i]; + + id_to_levels(lev, str, levels_to_id(lev, str)); // Pack tables + cdef_get_bits(lev, str, &cm->dering_bits, &cm->clpf_bits); + + for (i = 0; i < sb_count; i++) { + int gi, cs; + int best_gi, best_clpf; + uint64_t best_mse = (uint64_t)1 << 63; + best_gi = best_clpf = 0; + for (gi = 0; gi < (1 << cm->dering_bits); gi++) { + for (cs = 0; cs < (1 << cm->clpf_bits); cs++) { + if (mse[i][lev[gi]][str[cs]] < best_mse) { + best_gi = gi; + best_clpf = cs; + best_mse = mse[i][lev[gi]][str[cs]]; + } + } + } + cm->mi_grid_visible[sb_index[i]]->mbmi.dering_gain = best_gi; + cm->mi_grid_visible[sb_index[i]]->mbmi.clpf_strength = best_clpf; + } + + aom_free(src); + aom_free(ref_coeff); + aom_free(mse); + aom_free(sb_index); + + av1_clpf_test_plane(cm->frame_to_show, ref, cm, &cm->clpf_strength_u, + AOM_PLANE_U); + av1_clpf_test_plane(cm->frame_to_show, ref, cm, &cm->clpf_strength_v, + AOM_PLANE_V); + cm->dering_level = levels_to_id(best_lev, best_str); +} diff --git a/av1/encoder/pickdering.c b/av1/encoder/pickdering.c deleted file mode 100644 index dce76863231a88c0c9b243e02d3d24ff49976b5b..0000000000000000000000000000000000000000 --- a/av1/encoder/pickdering.c +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_scale_rtcd.h" -#include "av1/common/dering.h" -#include "av1/common/onyxc_int.h" -#include "av1/common/reconinter.h" -#include "av1/encoder/encoder.h" -#include "aom/aom_integer.h" - -static double compute_dist(int16_t *x, int xstride, int16_t *y, int ystride, - int nhb, int nvb, int coeff_shift) { - int i, j; - double sum; - sum = 0; - for (i = 0; i < nvb << 3; i++) { - for (j = 0; j < nhb << 3; j++) { - double tmp; - tmp = x[i * xstride + j] - y[i * ystride + j]; - sum += tmp * tmp; - } - } - return sum / (double)(1 << 2 * coeff_shift); -} - -int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, - AV1_COMMON *cm, MACROBLOCKD *xd) { - int r, c; - int sbr, sbc; - int nhsb, nvsb; - int16_t *src; - int16_t *ref_coeff; - dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE]; - int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } }; - int stride; - int bsize[3]; - int dec[3]; - int pli; - int level; - int best_level; - int dering_count; - int coeff_shift = AOMMAX(cm->bit_depth - 8, 0); - src = aom_malloc(sizeof(*src) * cm->mi_rows * cm->mi_cols * 64); - ref_coeff = aom_malloc(sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * 64); - av1_setup_dst_planes(xd->plane, frame, 0, 0); - for (pli = 0; pli < 3; pli++) { - dec[pli] = xd->plane[pli].subsampling_x; - bsize[pli] = OD_DERING_SIZE_LOG2 - dec[pli]; - } - stride = cm->mi_cols << bsize[0]; - for (r = 0; r < cm->mi_rows << bsize[0]; ++r) { - for (c = 0; c < cm->mi_cols << bsize[0]; ++c) { -#if CONFIG_AOM_HIGHBITDEPTH - if (cm->use_highbitdepth) { - src[r * stride + c] = CONVERT_TO_SHORTPTR( - xd->plane[0].dst.buf)[r * xd->plane[0].dst.stride + c]; - ref_coeff[r * stride + c] = - CONVERT_TO_SHORTPTR(ref->y_buffer)[r * ref->y_stride + c]; - } else { -#endif - src[r * stride + c] = - xd->plane[0].dst.buf[r * xd->plane[0].dst.stride + c]; - ref_coeff[r * stride + c] = ref->y_buffer[r * ref->y_stride + c]; -#if CONFIG_AOM_HIGHBITDEPTH - } -#endif - } - } - nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; - nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE; - /* Pick a base threshold based on the quantizer. The threshold will then be - adjusted on a 64x64 basis. We use a threshold of the form T = a*Q^b, - where a and b are derived empirically trying to optimize rate-distortion - at different quantizer settings. */ - best_level = AOMMIN( - MAX_DERING_LEVEL - 1, - (int)floor(.5 + - .45 * pow(av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> - (cm->bit_depth - 8), - 0.6))); - for (sbr = 0; sbr < nvsb; sbr++) { - for (sbc = 0; sbc < nhsb; sbc++) { - int nvb, nhb; - int gi; - int best_gi; - int32_t best_mse = INT32_MAX; - int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]; - int16_t tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]; - nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc); - nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr); - dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE, - sbc * MAX_MIB_SIZE, dlist); - if (dering_count == 0) continue; - best_gi = 0; - for (gi = 0; gi < DERING_REFINEMENT_LEVELS; gi++) { - int cur_mse; - int threshold; - int16_t inbuf[OD_DERING_INBUF_SIZE]; - int16_t *in; - int i, j; - level = compute_level_from_index(best_level, gi); - threshold = level << coeff_shift; - for (r = 0; r < nvb << bsize[0]; r++) { - for (c = 0; c < nhb << bsize[0]; c++) { - dst[(r * MAX_MIB_SIZE << bsize[0]) + c] = - src[((sbr * MAX_MIB_SIZE << bsize[0]) + r) * stride + - (sbc * MAX_MIB_SIZE << bsize[0]) + c]; - } - } - in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER; - /* We avoid filtering the pixels for which some of the pixels to average - are outside the frame. We could change the filter instead, but it - would - add special cases for any future vectorization. */ - for (i = 0; i < OD_DERING_INBUF_SIZE; i++) - inbuf[i] = OD_DERING_VERY_LARGE; - for (i = -OD_FILT_VBORDER * (sbr != 0); - i < (nvb << bsize[0]) + OD_FILT_VBORDER * (sbr != nvsb - 1); i++) { - for (j = -OD_FILT_HBORDER * (sbc != 0); - j < (nhb << bsize[0]) + OD_FILT_HBORDER * (sbc != nhsb - 1); - j++) { - int16_t *x; - x = &src[(sbr * stride * MAX_MIB_SIZE << bsize[0]) + - (sbc * MAX_MIB_SIZE << bsize[0])]; - in[i * OD_FILT_BSTRIDE + j] = x[i * stride + j]; - } - } - od_dering(tmp_dst, in, 0, dir, 0, dlist, dering_count, threshold, - coeff_shift); - copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[0], tmp_dst, - dlist, dering_count, bsize[0]); - cur_mse = (int)compute_dist( - dst, MAX_MIB_SIZE << bsize[0], - &ref_coeff[(sbr * stride * MAX_MIB_SIZE << bsize[0]) + - (sbc * MAX_MIB_SIZE << bsize[0])], - stride, nhb, nvb, coeff_shift); - if (cur_mse < best_mse) { - best_gi = gi; - best_mse = cur_mse; - } - } - cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride + - MAX_MIB_SIZE * sbc] - ->mbmi.dering_gain = best_gi; - } - } - aom_free(src); - aom_free(ref_coeff); - return best_level; -}