From a25c6c3ba675a7b0c1e4793b210b02d71cdb42c0 Mon Sep 17 00:00:00 2001 From: Steinar Midtskogen <stemidts@cisco.com> Date: Tue, 13 Sep 2016 16:37:13 +0200 Subject: [PATCH] Extend CLPF to chroma. Objective quality impact (low latency): PSNR YCbCr: 0.13% -1.37% -1.79% PSNRHVS: 0.03% SSIM: 0.24% MSSSIM: 0.10% CIEDE2000: -0.83% Change-Id: I8ddf0def569286775f0f9d4d4005932766a7fc27 --- aom_dsp/aom_dsp_rtcd_defs.pl | 8 +- av1/common/clpf.c | 116 ++++-- av1/common/clpf.h | 4 +- av1/common/clpf_simd.h | 422 +++++++++++-------- av1/common/onyxc_int.h | 4 +- av1/decoder/decodeframe.c | 31 +- av1/encoder/bitstream.c | 6 +- av1/encoder/clpf_rdo.c | 175 +++++--- av1/encoder/clpf_rdo.h | 4 +- av1/encoder/clpf_rdo_simd.h | 775 +++++++++-------------------------- av1/encoder/encoder.c | 33 +- test/clpf_test.cc | 2 + 12 files changed, 695 insertions(+), 885 deletions(-) diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 04a2cc4df8..6eb6310c8f 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -628,16 +628,16 @@ if (aom_config("CONFIG_CLPF") eq "yes") { if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/; - add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift"; + add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift, int size"; specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/; - add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift"; + add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift, int size"; specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/; } add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/; - add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength"; + add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size"; specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/; - add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum"; + add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size"; specialize qw/aom_clpf_detect_multi sse2 ssse3 sse4_1 neon/; } diff --git a/av1/common/clpf.c b/av1/common/clpf.c index ff2c372615..6bea2bbe59 100644 --- a/av1/common/clpf.c +++ b/av1/common/clpf.c @@ -8,9 +8,10 @@ * 
Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include <assert.h> + #include "av1/common/clpf.h" #include "./aom_dsp_rtcd.h" +#include "aom/aom_image.h" #include "aom_dsp/aom_dsp_common.h" int av1_clpf_maxbits(const AV1_COMMON *cm) { @@ -72,21 +73,24 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride, #endif // Return number of filtered blocks -int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, - const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm, - int enable_fb_flag, unsigned int strength, - unsigned int fb_size_log2, uint8_t *blocks, - int (*decision)(int, int, const YV12_BUFFER_CONFIG *, - const YV12_BUFFER_CONFIG *, - const AV1_COMMON *cm, int, int, int, - unsigned int, unsigned int, uint8_t *)) { +int av1_clpf_frame( + const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org, + AV1_COMMON *cm, int enable_fb_flag, unsigned int strength, + unsigned int fb_size_log2, uint8_t *blocks, int plane, + int (*decision)(int, int, const YV12_BUFFER_CONFIG *, + const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int, + int, unsigned int, unsigned int, uint8_t *, int)) { /* Constrained low-pass filter (CLPF) */ int c, k, l, m, n; - const int bs = MI_SIZE; - const int width = frame->y_crop_width; - const int height = frame->y_crop_height; + const int subx = plane != AOM_PLANE_Y && frame->subsampling_x; + const int suby = plane != AOM_PLANE_Y && frame->subsampling_y; + const int bs = (subx || suby) ? 4 : 8; + const int bslog = get_msb(bs); + int width = plane != AOM_PLANE_Y ? frame->uv_crop_width : frame->y_crop_width; + int height = + plane != AOM_PLANE_Y ? frame->uv_crop_height : frame->y_crop_height; int xpos, ypos; - const int sstride = frame->y_stride; + const int sstride = plane != AOM_PLANE_Y ? frame->uv_stride : frame->y_stride; int dstride = bs; const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2; const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2; @@ -97,9 +101,11 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, int cache_idx = 0; const int cache_size = num_fb_hor << (2 * fb_size_log2); const int cache_blocks = cache_size / (bs * bs); - YV12_BUFFER_CONFIG dst = *frame; - - assert(bs == 8); // Optimised code assumes this. + uint8_t *src_buffer = + plane != AOM_PLANE_Y + ? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer) + : frame->y_buffer; + uint8_t *dst_buffer; #if CONFIG_AOM_HIGHBITDEPTH strength <<= (cm->bit_depth - 8); @@ -108,10 +114,10 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, // Make buffer space for in-place filtering #if CONFIG_AOM_HIGHBITDEPTH CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size << !!cm->use_highbitdepth)); - dst.y_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache; + dst_buffer = cm->use_highbitdepth ? 
CONVERT_TO_BYTEPTR(cache) : cache; #else CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size)); - dst.y_buffer = cache; + dst_buffer = cache; #endif CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr))); CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst))); @@ -130,7 +136,8 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, ypos = yoff + m * bs; if (xpos < width && ypos < height) { allskip &= - cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] + cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride + + (xpos << subx) / MI_SIZE] ->mbmi.skip; } } @@ -144,13 +151,14 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, if (!allskip && // Do not filter the block if all is skip encoded (!enable_fb_flag || decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength, - fb_size_log2, blocks + block_index))) { + fb_size_log2, blocks + block_index, plane))) { // Iterate over all smaller blocks inside the filter block - for (m = 0; m < (h + bs - 1) / bs; m++) { - for (n = 0; n < (w + bs - 1) / bs; n++) { + for (m = 0; m < ((h + bs - 1) >> bslog); m++) { + for (n = 0; n < ((w + bs - 1) >> bslog); n++) { xpos = xoff + n * bs; ypos = yoff + m * bs; - if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] + if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride + + (xpos << subx) / MI_SIZE] ->mbmi.skip) { // Not skip block // Temporary buffering needed if filtering in-place if (cache_ptr[cache_idx]) { @@ -161,50 +169,59 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, for (c = 0; c < bs; c++) { *(uint64_t *)(d + c * sstride) = *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2); - *(uint64_t *)(d + c * sstride + 4) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8); + if (bs == 8) + *(uint64_t *)(d + c * sstride + 4) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8); } } else { for (c = 0; c < bs; c++) - *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + if (bs == 8) + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + else + *(uint32_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint32_t *)(cache_ptr[cache_idx] + c * bs); } #else for (c = 0; c < bs; c++) - *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + if (bs == 8) + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + else + *(uint32_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint32_t *)(cache_ptr[cache_idx] + c * bs); #endif } #if CONFIG_AOM_HIGHBITDEPTH if (cm->use_highbitdepth) { cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2; - dst.y_buffer = + dst_buffer = CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) - ypos * bs - xpos; } else { cache_ptr[cache_idx] = cache + cache_idx * bs * bs; - dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; + dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; } #else cache_ptr[cache_idx] = cache + cache_idx * bs * bs; - dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; + dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; #endif - cache_dst[cache_idx] = frame->y_buffer + ypos * sstride + xpos; + cache_dst[cache_idx] = src_buffer + ypos * sstride + xpos; if (++cache_idx >= cache_blocks) cache_idx = 0; // Apply the filter #if CONFIG_AOM_HIGHBITDEPTH if (cm->use_highbitdepth) { - aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(frame->y_buffer), - CONVERT_TO_SHORTPTR(dst.y_buffer), sstride, + 
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer), + CONVERT_TO_SHORTPTR(dst_buffer), sstride, dstride, xpos, ypos, bs, bs, width, height, strength); } else { - aom_clpf_block(frame->y_buffer, dst.y_buffer, sstride, dstride, - xpos, ypos, bs, bs, width, height, strength); + aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos, + ypos, bs, bs, width, height, strength); } #else - aom_clpf_block(frame->y_buffer, dst.y_buffer, sstride, dstride, - xpos, ypos, bs, bs, width, height, strength); + aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos, + ypos, bs, bs, width, height, strength); #endif } } @@ -223,18 +240,27 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, for (c = 0; c < bs; c++) { *(uint64_t *)(d + c * sstride) = *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2); - *(uint64_t *)(d + c * sstride + 4) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8); + if (bs == 8) + *(uint64_t *)(d + c * sstride + 4) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8); } } else { for (c = 0; c < bs; c++) - *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + if (bs == 4) + *(uint32_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint32_t *)(cache_ptr[cache_idx] + c * bs); + else + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); } #else for (c = 0; c < bs; c++) - *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = - *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + if (bs == 4) + *(uint32_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint32_t *)(cache_ptr[cache_idx] + c * bs); + else + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); #endif } diff --git a/av1/common/clpf.h b/av1/common/clpf.h index 932ef3c29d..46d10e466c 100644 --- a/av1/common/clpf.h +++ b/av1/common/clpf.h @@ -20,10 +20,10 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b); int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm, int enable_fb_flag, unsigned int strength, - unsigned int fb_size_log2, uint8_t *blocks, + unsigned int fb_size_log2, uint8_t *blocks, int plane, int (*decision)(int, int, const YV12_BUFFER_CONFIG *, const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int, int, - unsigned int, unsigned int, uint8_t *)); + unsigned int, unsigned int, uint8_t *, int)); #endif diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h index 2507c151a4..979856b49a 100644 --- a/av1/common/clpf_simd.h +++ b/av1/common/clpf_simd.h @@ -10,131 +10,165 @@ */ #include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" -SIMD_INLINE void calc_delta(v128 o, v128 x, v128 a, v128 b, v128 c, v128 d, - v128 e, v128 f, uint8_t *dst, v128 sp, v128 sm, - int dstride) { +// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) + +// 3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) + +// 1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s) +SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, + v128 f, v128 sp, v128 sm) { + // The difference will be 9 bit, offset by 128 so we can use saturated + // sub to avoid going to 16 bit temporarily before "strength" clipping. 
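+  // For reference, each lane below computes the scalar expression of
+  // av1_clpf_sample() (s = strength, o = centre pixel):
+  //   delta = 4 * clamp(a - o, -s, s) + clamp(b - o, -s, s) +
+  //           3 * clamp(c - o, -s, s) + 3 * clamp(d - o, -s, s) +
+  //           clamp(e - o, -s, s) + 4 * clamp(f - o, -s, s)
+  //   o += (8 + delta - (delta < 0)) >> 4  (rounding to nearest)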
+ const v128 c128 = v128_dup_8(128); + const v128 x = v128_add_8(c128, o); const v128 c8 = v128_dup_8(8); - const v128 tmp = - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); + const v128 tmp = v128_add_8( + v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, c), x), sp), sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, d), x), sp), sm)); const v128 delta = v128_add_8( v128_add_8( v128_shl_8( - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), + v128_add_8( + v128_max_s8( + v128_min_s8(v128_ssub_s8(v128_add_8(c128, a), x), sp), + sm), + v128_max_s8( + v128_min_s8(v128_ssub_s8(v128_add_8(c128, f), x), sp), + sm)), 2), - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), + v128_add_8( + v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, b), x), sp), + sm), + v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, e), x), sp), + sm))), v128_add_8(v128_add_8(tmp, tmp), tmp)); - o = v128_add_8( + return v128_add_8( o, v128_shr_s8( v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), 4)); - v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + dstride, v128_low_v64(o)); } -static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride, - int dstride, int x0, int y0, int sizey, int width, - int height, unsigned int strength) { - int bottom = height - 2 - y0; +// Process blocks of width 8, two lines at a time, 8 bit. +static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizey, int width, + int height, unsigned int strength) { + const int bottom = height - 2 - y0; + const int right = width - 8 - x0; const v128 sp = v128_dup_8(strength); const v128 sm = v128_dup_8(-(int)strength); - const v128 c128 = v128_dup_8(128); + DECLARE_ALIGNED(16, static const uint64_t, + b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL }; + DECLARE_ALIGNED(16, static const uint64_t, + c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL }; + DECLARE_ALIGNED(16, static const uint64_t, + d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL }; + DECLARE_ALIGNED(16, static const uint64_t, + e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL }; + int y; + dst += x0 + y0 * dstride; src += x0 + y0 * sstride; - if (!x0) { // Clip left - const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), - v64_from_64(0x0504030201000000LL)); - const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), - v64_from_64(0x0605040302010000LL)); - int y; + for (y = 0; y < sizey; y += 2) { + const v64 l1 = v64_load_aligned(src); + const v64 l2 = v64_load_aligned(src + sstride); + v128 o = v128_from_v64(l1, l2); + const v128 a = + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1); + v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0), + v64_load_unaligned(src - 2 * !!x0 + sstride)); + v128 c = v128_from_v64(v64_load_unaligned(src - !!x0), + v64_load_unaligned(src - !!x0 + sstride)); + v128 d = v128_from_v64(v64_load_unaligned(src + !!right), + v64_load_unaligned(src + !!right + sstride)); + v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right), + v64_load_unaligned(src + 2 * !!right + sstride)); + const v128 f = v128_from_v64( + l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)); - for (y = 0; y < sizey; y += 2) { - const v64 l1 = 
v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + sstride); - v128 o = v128_from_v64(l1, l2); - const v128 x = v128_add_8(c128, o); - const v128 a = v128_add_8( - c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); - const v128 b = v128_shuffle_8(x, b_shuff); - const v128 c = v128_shuffle_8(x, c_shuff); - const v128 d = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + sstride))); - const v128 e = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + sstride))); - const v128 f = v128_add_8( - c128, v128_from_v64( - l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride))); - calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride); - src += sstride * 2; - dst += dstride * 2; + if (!x0) { // Left clipping + b = v128_shuffle_8(b, v128_load_aligned(b_shuff)); + c = v128_shuffle_8(c, v128_load_aligned(c_shuff)); + } + if (!right) { // Right clipping + d = v128_shuffle_8(d, v128_load_aligned(d_shuff)); + e = v128_shuffle_8(e, v128_load_aligned(e_shuff)); } - } else if (!(width - x0 - 8)) { // Clip right - const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), - v64_from_64(0x0707060504030201LL)); - const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), - v64_from_64(0x0707070605040302LL)); - int y; - for (y = 0; y < sizey; y += 2) { - const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + sstride); - v128 o = v128_from_v64(l1, l2); - const v128 x = v128_add_8(c128, o); - const v128 a = v128_add_8( - c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); - const v128 b = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + sstride))); - const v128 c = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + sstride))); - const v128 d = v128_shuffle_8(x, d_shuff); - const v128 e = v128_shuffle_8(x, e_shuff); - const v128 f = v128_add_8( - c128, v128_from_v64( - l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride))); - calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride); - src += sstride * 2; - dst += dstride * 2; + o = calc_delta(o, a, b, c, d, e, f, sp, sm); + v64_store_aligned(dst, v128_high_v64(o)); + v64_store_aligned(dst + dstride, v128_low_v64(o)); + src += sstride * 2; + dst += dstride * 2; + } +} + +// Process blocks of width 4, four lines at a time, 8 bit. 
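+// Only needed for chroma with 4:2:0/4:2:2 subsampling, where the filter
+// blocks are 4x4: four 32-bit rows are packed into one 128-bit vector so
+// the same calc_delta() kernel as the width-8 case can be reused.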
+static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizey, int width, + int height, unsigned int strength) { + const v128 sp = v128_dup_8(strength); + const v128 sm = v128_dup_8(-(int)strength); + const int right = width - 4 - x0; + const int bottom = height - 4 - y0; + DECLARE_ALIGNED(16, static const uint64_t, + b_shuff[]) = { 0x0504040401000000LL, 0x0d0c0c0c09080808LL }; + DECLARE_ALIGNED(16, static const uint64_t, + c_shuff[]) = { 0x0605040402010000LL, 0x0e0d0c0c0a090808LL }; + DECLARE_ALIGNED(16, static const uint64_t, + d_shuff[]) = { 0x0707060503030201LL, 0x0f0f0e0d0b0b0a09LL }; + DECLARE_ALIGNED(16, static const uint64_t, + e_shuff[]) = { 0x0707070603030302LL, 0x0f0f0f0e0b0b0b0aLL }; + int y; + + dst += x0 + y0 * dstride; + src += x0 + y0 * sstride; + + for (y = 0; y < sizey; y += 4) { + const uint32_t l0 = u32_load_aligned(src - (y != -y0) * sstride); + const uint32_t l1 = u32_load_aligned(src); + const uint32_t l2 = u32_load_aligned(src + sstride); + const uint32_t l3 = u32_load_aligned(src + 2 * sstride); + const uint32_t l4 = u32_load_aligned(src + 3 * sstride); + const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride); + v128 o = v128_from_32(l1, l2, l3, l4); + const v128 a = v128_from_32(l0, l1, l2, l3); + v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0), + u32_load_unaligned(src + sstride - 2 * !!x0), + u32_load_unaligned(src + 2 * sstride - 2 * !!x0), + u32_load_unaligned(src + 3 * sstride - 2 * !!x0)); + v128 c = v128_from_32(u32_load_unaligned(src - !!x0), + u32_load_unaligned(src + sstride - !!x0), + u32_load_unaligned(src + 2 * sstride - !!x0), + u32_load_unaligned(src + 3 * sstride - !!x0)); + v128 d = v128_from_32(u32_load_unaligned(src + !!right), + u32_load_unaligned(src + sstride + !!right), + u32_load_unaligned(src + 2 * sstride + !!right), + u32_load_unaligned(src + 3 * sstride + !!right)); + v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right), + u32_load_unaligned(src + sstride + 2 * !!right), + u32_load_unaligned(src + 2 * sstride + 2 * !!right), + u32_load_unaligned(src + 3 * sstride + 2 * !!right)); + const v128 f = v128_from_32(l2, l3, l4, l5); + + if (!x0) { // Left clipping + b = v128_shuffle_8(b, v128_load_aligned(b_shuff)); + c = v128_shuffle_8(c, v128_load_aligned(c_shuff)); } - } else { // No left/right clipping - int y; - for (y = 0; y < sizey; y += 2) { - const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + sstride); - v128 o = v128_from_v64(l1, l2); - const v128 x = v128_add_8(c128, o); - const v128 a = v128_add_8( - c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); - const v128 b = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + sstride))); - const v128 c = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + sstride))); - const v128 d = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + sstride))); - const v128 e = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + sstride))); - const v128 f = v128_add_8( - c128, v128_from_v64( - l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride))); - calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride); - src += sstride * 2; - dst += dstride * 2; + if (!right) { // Right clipping + d = v128_shuffle_8(d, v128_load_aligned(d_shuff)); + e = v128_shuffle_8(e, v128_load_aligned(e_shuff)); } + + o = 
calc_delta(o, a, b, c, d, e, f, sp, sm);
+    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
+    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
+    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
+    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
+
+    dst += 4 * dstride;
+    src += 4 * sstride;
   }
 }
 
@@ -142,24 +176,23 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
                                int dstride, int x0, int y0, int sizex,
                                int sizey, int width, int height,
                                unsigned int strength) {
-  // TODO(stemidts):
-  // A sizex different from 8 will only be needed if CLPF is extended to chroma.
-  // This will only be used if 4:2:0 and width not a multiple of 16 and along
-  // the right edge only, so we can fall back to the plain C implementation in
-  // this case. If not extended to chroma, this test will be redundant.
-  if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
+  if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
+      (sizey & 3 && sizex == 4) || x0 + 4 > width) {
     // Fallback to C for odd sizes
     aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
                      height, strength);
   } else {
-    clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
-               strength);
+    (sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0, y0,
+                                             sizey, width, height, strength);
   }
 }
 
 #if CONFIG_AOM_HIGHBITDEPTH
-static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
-                           v128 f, uint16_t *dst, v128 sp, v128 sm) {
+// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
+//         3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
+//         1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
+SIMD_INLINE v128 calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+                                v128 f, v128 sp, v128 sm) {
   const v128 c8 = v128_dup_16(8);
   const v128 tmp =
       v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(c, o), sp), sm),
@@ -174,73 +207,124 @@ static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
           v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(b, o), sp), sm),
                       v128_max_s16(v128_min_s16(v128_sub_16(e, o), sp), sm))),
       v128_add_16(v128_add_16(tmp, tmp), tmp));
-  v128_store_aligned(
-      dst,
-      v128_add_16(
-          o, v128_shr_s16(
-                 v128_add_16(c8, v128_add_16(delta, v128_cmplt_s16(
-                                                        delta, v128_zero()))),
-                 4)));
+  return v128_add_16(
+      o, v128_shr_s16(
+             v128_add_16(
+                 c8, v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
+             4));
 }
 
+static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+                            v128 f, uint16_t *dst, v128 sp, v128 sm,
+                            int dstride) {
+  o = calc_delta_hbd(o, a, b, c, d, e, f, sp, sm);
+  v64_store_aligned(dst, v128_high_v64(o));
+  v64_store_aligned(dst + dstride, v128_low_v64(o));
+}
+
+static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
+                            v128 f, uint16_t *dst, v128 sp, v128 sm) {
+  v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, sp, sm));
+}
+
+// Process blocks of width 4, two lines at a time.
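+// High bit depth pixels are 16 bits wide, so a 128-bit vector only holds
+// two 4-pixel rows, and the result is written back with two 64-bit stores.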
+SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst, + int sstride, int dstride, int x0, int y0, + int sizey, int width, int height, + unsigned int strength) { + const v128 sp = v128_dup_16(strength); + const v128 sm = v128_dup_16(-(int)strength); + const int right = width - 4 - x0; + const int bottom = height - 2 - y0; + DECLARE_ALIGNED(16, static const uint64_t, + b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090809080908LL }; + DECLARE_ALIGNED(16, static const uint64_t, + c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080908LL }; + DECLARE_ALIGNED(16, static const uint64_t, + d_shuff[]) = { 0x0706070605040302LL, 0x0f0e0f0e0d0c0b0aLL }; + DECLARE_ALIGNED(16, static const uint64_t, + e_shuff[]) = { 0x0706070607060504LL, 0x0f0e0f0e0f0e0d0cLL }; + int y; + + dst += x0 + y0 * dstride; + src += x0 + y0 * sstride; + + for (y = 0; y < sizey; y += 2) { + const v64 l1 = v64_load_aligned(src); + const v64 l2 = v64_load_aligned(src + sstride); + v128 o = v128_from_v64(l1, l2); + const v128 a = + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1); + v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0), + v64_load_unaligned(src - 2 * !!x0 + sstride)); + v128 c = v128_from_v64(v64_load_unaligned(src - !!x0), + v64_load_unaligned(src - !!x0 + sstride)); + v128 d = v128_from_v64(v64_load_unaligned(src + !!right), + v64_load_unaligned(src + !!right + sstride)); + v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right), + v64_load_unaligned(src + 2 * !!right + sstride)); + const v128 f = v128_from_v64( + l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)); + + if (!x0) { // Left clipping + b = v128_shuffle_8(b, v128_load_aligned(b_shuff)); + c = v128_shuffle_8(c, v128_load_aligned(c_shuff)); + } + if (!right) { // Right clipping + d = v128_shuffle_8(d, v128_load_aligned(d_shuff)); + e = v128_shuffle_8(e, v128_load_aligned(e_shuff)); + } + calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride); + src += sstride * 2; + dst += dstride * 2; + } +} + +// The most simple case. Start here if you need to understand the functions. 
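+// One full row of eight 16-bit pixels fills a 128-bit vector, so no row
+// packing is needed, and with 16-bit lanes the +128 offset trick of the
+// 8-bit kernel is unnecessary.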
SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizey, int width, int height, unsigned int strength) { - int y; - int bottom = height - 2 - y0; const v128 sp = v128_dup_16(strength); const v128 sm = v128_dup_16(-(int)strength); + const int right = width - 8 - x0; + const int bottom = height - 2 - y0; + DECLARE_ALIGNED(16, static const uint64_t, + b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090807060504LL }; + DECLARE_ALIGNED(16, static const uint64_t, + c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080706LL }; + DECLARE_ALIGNED(16, static const uint64_t, + d_shuff[]) = { 0x0908070605040302LL, 0x0f0e0f0e0d0c0b0aLL }; + DECLARE_ALIGNED(16, static const uint64_t, + e_shuff[]) = { 0x0b0a090807060504LL, 0x0f0e0f0e0f0e0d0cLL }; + int y; dst += x0 + y0 * dstride; src += x0 + y0 * sstride; - if (!x0) { // Clip left - const v128 b_shuff = v128_from_v64(v64_from_64(0x0b0a090807060504LL), - v64_from_64(0x0302010001000100LL)); - const v128 c_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080706LL), - v64_from_64(0x0504030201000100LL)); - for (y = 0; y < sizey; y++) { - const v128 o = v128_load_aligned(src); - const v128 a = v128_load_aligned(src - (y != -y0) * sstride); - const v128 b = v128_shuffle_8(o, b_shuff); - const v128 c = v128_shuffle_8(o, c_shuff); - const v128 d = v128_load_unaligned(src + 1); - const v128 e = v128_load_unaligned(src + 2); - const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride); - calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm); - src += sstride; - dst += dstride; - } - } else if (!(width - x0 - 8)) { // Clip right - const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0d0c0b0aLL), - v64_from_64(0x0908070605040302LL)); - const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0e0f0e0f0e0d0cLL), - v64_from_64(0x0b0a090807060504LL)); - for (y = 0; y < sizey; y++) { - const v128 o = v128_load_aligned(src); - const v128 a = v128_load_aligned(src - (y != -y0) * sstride); - const v128 b = v128_load_unaligned(src - 2); - const v128 c = v128_load_unaligned(src - 1); - const v128 d = v128_shuffle_8(o, d_shuff); - const v128 e = v128_shuffle_8(o, e_shuff); - const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride); - calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm); - src += sstride; - dst += dstride; + // Read 8 set of pixels at a time. Clipping along upper and lower + // edges is handled by reading the upper or lower line twice. + // Clipping along the left and right edges is handled by shuffle + // instructions doing shift and pad. 
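+  // E.g. at the left edge, b (the pixels two positions to the left) is
+  // reloaded from the row itself and b_shuff rebuilds it as
+  // {p0, p0, p0, p1, p2, p3, p4, p5}, repeating the edge pixel.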
+ for (y = 0; y < sizey; y++) { + const v128 o = v128_load_aligned(src); + const v128 a = v128_load_aligned(src - (y != -y0) * sstride); + const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride); + v128 b = v128_load_unaligned(src - 2 * !!x0); + v128 c = v128_load_unaligned(src - !!x0); + v128 d = v128_load_unaligned(src + !!right); + v128 e = v128_load_unaligned(src + 2 * !!right); + + if (!x0) { // Left clipping + b = v128_shuffle_8(b, v128_load_aligned(b_shuff)); + c = v128_shuffle_8(c, v128_load_aligned(c_shuff)); } - } else { // No left/right clipping - for (y = 0; y < sizey; y++) { - const v128 o = v128_load_aligned(src); - const v128 a = v128_load_aligned(src - (y != -y0) * sstride); - const v128 b = v128_load_unaligned(src - 2); - const v128 c = v128_load_unaligned(src - 1); - const v128 d = v128_load_unaligned(src + 1); - const v128 e = v128_load_unaligned(src + 2); - const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride); - calc_delta_hbd(o, a, b, c, d, e, f, dst, sp, sm); - src += sstride; - dst += dstride; + if (!right) { // Right clipping + d = v128_shuffle_8(d, v128_load_aligned(d_shuff)); + e = v128_shuffle_8(e, v128_load_aligned(e_shuff)); } + calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm); + src += sstride; + dst += dstride; } } @@ -248,13 +332,13 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength) { - if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) { + if ((sizex != 4 && sizex != 8) || y0 + 4 > height || x0 + 4 > width) { // Fallback to C for odd sizes aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width, height, strength); } else { - clpf_block_hbd(src, dst, sstride, dstride, x0, y0, sizey, width, height, - strength); + (sizex == 4 ? 
clpf_block_hbd4 : clpf_block_hbd)( + src, dst, sstride, dstride, x0, y0, sizey, width, height, strength); } } #endif diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h index 7b83ba84ff..cd6b2b4651 100644 --- a/av1/common/onyxc_int.h +++ b/av1/common/onyxc_int.h @@ -154,7 +154,9 @@ typedef struct AV1Common { #if CONFIG_CLPF int clpf_numblocks; int clpf_size; - int clpf_strength; + int clpf_strength_y; + int clpf_strength_u; + int clpf_strength_v; uint8_t *clpf_blocks; #endif diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index 243bde9d05..c13e226ff5 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -29,6 +29,7 @@ #include "av1/common/alloccommon.h" #if CONFIG_CLPF +#include "aom/aom_image.h" #include "av1/common/clpf.h" #endif #include "av1/common/common.h" @@ -866,8 +867,10 @@ static void setup_loopfilter(struct loopfilter *lf, #if CONFIG_CLPF static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { cm->clpf_blocks = 0; - cm->clpf_strength = aom_rb_read_literal(rb, 2); - if (cm->clpf_strength) { + cm->clpf_strength_y = aom_rb_read_literal(rb, 2); + cm->clpf_strength_u = aom_rb_read_literal(rb, 2); + cm->clpf_strength_v = aom_rb_read_literal(rb, 2); + if (cm->clpf_strength_y) { cm->clpf_size = aom_rb_read_literal(rb, 2); if (cm->clpf_size) { int i; @@ -885,7 +888,8 @@ static int clpf_bit(UNUSED int k, UNUSED int l, UNUSED const YV12_BUFFER_CONFIG *org, UNUSED const AV1_COMMON *cm, UNUSED int block_size, UNUSED int w, UNUSED int h, UNUSED unsigned int strength, - UNUSED unsigned int fb_size_log2, uint8_t *bit) { + UNUSED unsigned int fb_size_log2, uint8_t *bit, + UNUSED int comp) { return *bit; } #endif @@ -2404,10 +2408,23 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data, } #if CONFIG_CLPF - if (cm->clpf_strength && !cm->skip_loop_filter) { - av1_clpf_frame(&pbi->cur_buf->buf, 0, cm, !!cm->clpf_size, - cm->clpf_strength + (cm->clpf_strength == 3), - 4 + cm->clpf_size, cm->clpf_blocks, clpf_bit); + if (!cm->skip_loop_filter) { + const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf; + if (cm->clpf_strength_y) { + av1_clpf_frame(frame, NULL, cm, !!cm->clpf_size, + cm->clpf_strength_y + (cm->clpf_strength_y == 3), + 4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, clpf_bit); + } + if (cm->clpf_strength_u) { + av1_clpf_frame(frame, NULL, cm, 0, + cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4, NULL, + AOM_PLANE_U, NULL); + } + if (cm->clpf_strength_v) { + av1_clpf_frame(frame, NULL, cm, 0, + cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4, NULL, + AOM_PLANE_V, NULL); + } } if (cm->clpf_blocks) aom_free(cm->clpf_blocks); #endif diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c index f6693dc8cc..bb58e39ade 100644 --- a/av1/encoder/bitstream.c +++ b/av1/encoder/bitstream.c @@ -1428,8 +1428,10 @@ static void encode_loopfilter(struct loopfilter *lf, #if CONFIG_CLPF static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { - aom_wb_write_literal(wb, cm->clpf_strength, 2); - if (cm->clpf_strength) { + aom_wb_write_literal(wb, cm->clpf_strength_y, 2); + aom_wb_write_literal(wb, cm->clpf_strength_u, 2); + aom_wb_write_literal(wb, cm->clpf_strength_v, 2); + if (cm->clpf_strength_y) { aom_wb_write_literal(wb, cm->clpf_size, 2); if (cm->clpf_size) { int i; diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c index 905a8080ef..dc05937f27 100644 --- a/av1/encoder/clpf_rdo.c +++ b/av1/encoder/clpf_rdo.c @@ -11,16 +11,17 @@ #include "av1/common/clpf.h" #include 
"./aom_dsp_rtcd.h" +#include "aom/aom_image.h" #include "aom/aom_integer.h" #include "av1/common/quant_common.h" // Calculate the error of a filtered and unfiltered block void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, - int *sum0, int *sum1, unsigned int strength) { + int *sum0, int *sum1, unsigned int strength, int size) { int x, y; - for (y = y0; y < y0 + 8; y++) { - for (x = x0; x < x0 + 8; x++) { + for (y = y0; y < y0 + size; y++) { + for (x = x0; x < x0 + size; x++) { int O = org[y * ostride + x]; int X = rec[y * rstride + x]; int A = rec[AOMMAX(0, y - 1) * rstride + x]; @@ -39,11 +40,11 @@ void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride, void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, - int width, int height, int *sum) { + int width, int height, int *sum, int size) { int x, y; - for (y = y0; y < y0 + 8; y++) { - for (x = x0; x < x0 + 8; x++) { + for (y = y0; y < y0 + size; y++) { + for (x = x0; x < x0 + size; x++) { int O = org[y * ostride + x]; int X = rec[y * rstride + x]; int A = rec[AOMMAX(0, y - 1) * rstride + x]; @@ -71,10 +72,10 @@ void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org, void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, - unsigned int strength, int shift) { + unsigned int strength, int shift, int size) { int x, y; - for (y = y0; y < y0 + 8; y++) { - for (x = x0; x < x0 + 8; x++) { + for (y = y0; y < y0 + size; y++) { + for (x = x0; x < x0 + size; x++) { int O = org[y * ostride + x] >> shift; int X = rec[y * rstride + x] >> shift; int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift; @@ -94,11 +95,12 @@ void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org, // aom_clpf_detect_multi_c() apart from "rec" and "org". void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, - int width, int height, int *sum, int shift) { + int width, int height, int *sum, int shift, + int size) { int x, y; - for (y = y0; y < y0 + 8; y++) { - for (x = x0; x < x0 + 8; x++) { + for (y = y0; y < y0 + size; y++) { + for (x = x0; x < x0 + size; x++) { int O = org[y * ostride + x] >> shift; int X = rec[y * rstride + x] >> shift; int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift; @@ -125,31 +127,45 @@ void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org, int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, int block_size, int w, int h, unsigned int strength, - unsigned int fb_size_log2, uint8_t *res) { + unsigned int fb_size_log2, uint8_t *res, int plane) { int m, n, sum0 = 0, sum1 = 0; + const int subx = plane != AOM_PLANE_Y && rec->subsampling_x; + const int suby = plane != AOM_PLANE_Y && rec->subsampling_y; + uint8_t *rec_buffer = + plane != AOM_PLANE_Y + ? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer) + : rec->y_buffer; + uint8_t *org_buffer = + plane != AOM_PLANE_Y + ? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer) + : org->y_buffer; + int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width; + int rec_height = + plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height; + int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride; + int org_stride = plane != AOM_PLANE_Y ? 
org->uv_stride : org->y_stride; for (m = 0; m < h; m++) { for (n = 0; n < w; n++) { int xpos = (l << fb_size_log2) + n * block_size; int ypos = (k << fb_size_log2) + m * block_size; - const int bs = MAX_MIB_SIZE; - if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] + if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride + + (xpos << subx) / MI_SIZE] ->mbmi.skip) { #if CONFIG_AOM_HIGHBITDEPTH if (cm->use_highbitdepth) { - aom_clpf_detect_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer), - CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride, - org->y_stride, xpos, ypos, rec->y_crop_width, - rec->y_crop_height, &sum0, &sum1, strength, - cm->bit_depth - 8); + aom_clpf_detect_hbd( + CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer), + rec_stride, org_stride, xpos, ypos, rec_width, rec_height, &sum0, + &sum1, strength, cm->bit_depth - 8, block_size); } else { - aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride, - org->y_stride, xpos, ypos, rec->y_crop_width, - rec->y_crop_height, &sum0, &sum1, strength); + aom_clpf_detect(rec_buffer, org_buffer, rec_stride, org_stride, xpos, + ypos, rec_width, rec_height, &sum0, &sum1, strength, + block_size); } #else - aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride, - org->y_stride, xpos, ypos, rec->y_crop_width, - rec->y_crop_height, &sum0, &sum1, strength); + aom_clpf_detect(rec_buffer, org_buffer, rec_stride, org_stride, xpos, + ypos, rec_width, rec_height, &sum0, &sum1, strength, + block_size); #endif } } @@ -161,6 +177,7 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, // Calculate the square error of all filter settings. Result: // res[0][0] : unfiltered // res[0][1-3] : strength=1,2,4, no signals +// (Only for luma:) // res[1][0] : (bit count, fb size = 128) // res[1][1-3] : strength=1,2,4, fb size = 128 // res[2][0] : (bit count, fb size = 64) @@ -170,12 +187,28 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, unsigned int block_size, unsigned int fb_size_log2, int w, - int h, int64_t res[4][4]) { + int h, int64_t res[4][4], int plane) { int c, m, n, filtered = 0; int sum[4]; + const int subx = plane != AOM_PLANE_Y && rec->subsampling_x; + const int suby = plane != AOM_PLANE_Y && rec->subsampling_y; int bslog = get_msb(block_size); + uint8_t *rec_buffer = + plane != AOM_PLANE_Y + ? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer) + : rec->y_buffer; + uint8_t *org_buffer = + plane != AOM_PLANE_Y + ? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer) + : org->y_buffer; + int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width; + int rec_height = + plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height; + int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride; + int org_stride = plane != AOM_PLANE_Y ? 
org->uv_stride : org->y_stride; sum[0] = sum[1] = sum[2] = sum[3] = 0; - if (fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) { + if (plane == AOM_PLANE_Y && + fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) { int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered; fb_size_log2--; @@ -190,16 +223,17 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec, oldfiltered = res[i][0]; res[i][0] = 0; - filtered = - clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1, res); + filtered = clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1, + res, plane); if (1 << (fb_size_log2 - bslog) < w) filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size, - fb_size_log2, w2, h1, res); + fb_size_log2, w2, h1, res, plane); if (1 << (fb_size_log2 - bslog) < h) { filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size, - fb_size_log2, w1, h2, res); - filtered |= clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), - rec, org, cm, block_size, fb_size_log2, w2, h2, res); + fb_size_log2, w1, h2, res, plane); + filtered |= + clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), rec, org, + cm, block_size, fb_size_log2, w2, h2, res, plane); } res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]); @@ -213,32 +247,31 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec, for (n = 0; n < w; n++) { int xpos = x + n * block_size; int ypos = y + m * block_size; - if (!cm->mi_grid_visible[ypos / MAX_MIB_SIZE * cm->mi_stride + - xpos / MAX_MIB_SIZE] + if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride + + (xpos << subx) / MI_SIZE] ->mbmi.skip) { #if CONFIG_AOM_HIGHBITDEPTH if (cm->use_highbitdepth) { - aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer), - CONVERT_TO_SHORTPTR(org->y_buffer), - rec->y_stride, org->y_stride, xpos, ypos, - rec->y_crop_width, rec->y_crop_height, sum, - cm->bit_depth - 8); + aom_clpf_detect_multi_hbd( + CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer), + rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum, + cm->bit_depth - 8, block_size); } else { - aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride, - org->y_stride, xpos, ypos, rec->y_crop_width, - rec->y_crop_height, sum); + aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride, + xpos, ypos, rec_width, rec_height, sum, + block_size); } #else - aom_clpf_detect_multi(rec->y_buffer, org->y_buffer, rec->y_stride, - org->y_stride, xpos, ypos, rec->y_crop_width, - rec->y_crop_height, sum); + aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride, + xpos, ypos, rec_width, rec_height, sum, + block_size); #endif filtered = 1; } } } - for (c = 0; c < 4; c++) { + for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++) { res[c][0] += sum[0]; res[c][1] += sum[1]; res[c][2] += sum[2]; @@ -249,30 +282,42 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec, void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, - int *best_strength, int *best_bs) { + int *best_strength, int *best_bs, int plane) { int c, j, k, l; int64_t best, sums[4][4]; - int width = rec->y_crop_width, height = rec->y_crop_height; - const int bs = MAX_MIB_SIZE; + int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width; + int height = plane != AOM_PLANE_Y ? 
rec->uv_crop_height : rec->y_crop_height; + const int bs = MI_SIZE; + const int bslog = get_msb(bs); int fb_size_log2 = get_msb(MAX_FB_SIZE); int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2; int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2; memset(sums, 0, sizeof(sums)); - for (k = 0; k < num_fb_ver; k++) { - for (l = 0; l < num_fb_hor; l++) { - // Calculate the block size after frame border clipping - int h = - AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); - int w = - AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); - h += !h << fb_size_log2; - w += !w << fb_size_log2; - clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, bs, - fb_size_log2, w / bs, h / bs, sums); + if (plane != AOM_PLANE_Y) + // Use a block size of MI_SIZE regardless of the subsampling. This + // This is accurate enough to determine the best strength and + // we don't need to add SIMD optimisations for 4x4 blocks. + clpf_rdo(0, 0, rec, org, cm, bs, fb_size_log2, width >> bslog, + height >> bslog, sums, plane); + else + for (k = 0; k < num_fb_ver; k++) { + for (l = 0; l < num_fb_hor; l++) { + // Calculate the block size after frame border clipping + int h = + AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); + int w = + AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); + h += !h << fb_size_log2; + w += !w << fb_size_log2; + clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, MI_SIZE, + fb_size_log2, w >> bslog, h >> bslog, sums, plane); + } } - } + + if (plane != AOM_PLANE_Y) // Slightly favour unfiltered chroma + sums[0][0] -= sums[0][0] >> 7; for (j = 0; j < 4; j++) { static const double lambda_square[] = { @@ -290,13 +335,13 @@ void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec, // Estimate the bit costs and adjust the square errors double lambda = lambda_square[av1_get_qindex(&cm->seg, 0, cm->base_qindex) >> 2]; - int i, cost = (int)((lambda * (sums[j][0] + 2 + 2 * (j > 0)) + 0.5)); + int i, cost = (int)((lambda * (sums[j][0] + 6 + 2 * (j > 0)) + 0.5)); for (i = 0; i < 4; i++) sums[j][i] = ((sums[j][i] + (i && j) * cost) << 4) + j * 4 + i; } best = (int64_t)1 << 62; - for (c = 0; c < 4; c++) + for (c = 0; c < (plane == AOM_PLANE_Y ? 4 : 1); c++) for (j = 0; j < 4; j++) if ((!c || j) && sums[c][j] < best) best = sums[c][j]; best &= 15; diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h index 3dd5478fcb..98b3c6765d 100644 --- a/av1/encoder/clpf_rdo.h +++ b/av1/encoder/clpf_rdo.h @@ -17,10 +17,10 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, int block_size, int w, int h, unsigned int strength, - unsigned int fb_size_log2, uint8_t *res); + unsigned int fb_size_log2, uint8_t *res, int plane); void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, - int *best_strength, int *best_bs); + int *best_strength, int *best_bs, int plane); #endif diff --git a/av1/encoder/clpf_rdo_simd.h b/av1/encoder/clpf_rdo_simd.h index 1bc5af6477..7c07329a3a 100644 --- a/av1/encoder/clpf_rdo_simd.h +++ b/av1/encoder/clpf_rdo_simd.h @@ -9,342 +9,171 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#include "./aom_dsp_rtcd.h" #include "aom_dsp/aom_simd.h" +#include "aom_ports/mem.h" -SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e, - v128 f, v128 sp, v128 sm) { - const v128 tmp = - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm)); - v128 delta = v128_add_8( - v128_add_8( - v128_shl_8( - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)), - 2), - v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm), - v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))), +SIMD_INLINE void calc_diff(v128 o, v128 *a, v128 *b, v128 *c, v128 *d, v128 *e, + v128 *f) { + // The difference will be 9 bit, offset by 128 so we can use saturated + // sub to avoid going to 16 bit temporarily before "strength" clipping. + const v128 c128 = v128_dup_8(128); + v128 x = v128_add_8(c128, o); + *a = v128_ssub_s8(v128_add_8(c128, *a), x); + *b = v128_ssub_s8(v128_add_8(c128, *b), x); + *c = v128_ssub_s8(v128_add_8(c128, *c), x); + *d = v128_ssub_s8(v128_add_8(c128, *d), x); + *e = v128_ssub_s8(v128_add_8(c128, *e), x); + *f = v128_ssub_s8(v128_add_8(c128, *f), x); +} + +SIMD_INLINE v128 delta_kernel(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, + v128 f, v128 sp, v128 sm) { + const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(c, sp), sm), + v128_max_s8(v128_min_s8(d, sp), sm)); + const v128 delta = v128_add_8( + v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, sp), sm), + v128_max_s8(v128_min_s8(f, sp), sm)), + 2), + v128_add_8(v128_max_s8(v128_min_s8(b, sp), sm), + v128_max_s8(v128_min_s8(e, sp), sm))), v128_add_8(v128_add_8(tmp, tmp), tmp)); - return v128_shr_s8( - v128_add_8(v128_dup_8(8), - v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), - 4); + return v128_add_8( + o, v128_shr_s8( + v128_add_8(v128_dup_8(8), + v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))), + 4)); +} + +SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, + v128 f, v128 sp, v128 sm) { + calc_diff(o, &a, &b, &c, &d, &e, &f); + return delta_kernel(o, a, b, c, d, e, f, sp, sm); +} + +SIMD_INLINE void clip_sides(v128 *b, v128 *c, v128 *d, v128 *e, int left, + int right) { + DECLARE_ALIGNED(16, static const uint64_t, + b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL }; + DECLARE_ALIGNED(16, static const uint64_t, + c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL }; + DECLARE_ALIGNED(16, static const uint64_t, + d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL }; + DECLARE_ALIGNED(16, static const uint64_t, + e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL }; + + if (!left) { // Left clipping + *b = v128_shuffle_8(*b, v128_load_aligned(b_shuff)); + *c = v128_shuffle_8(*c, v128_load_aligned(c_shuff)); + } + if (!right) { // Right clipping + *d = v128_shuffle_8(*d, v128_load_aligned(d_shuff)); + *e = v128_shuffle_8(*e, v128_load_aligned(e_shuff)); + } +} + +SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org, + int rstride, int ostride, int x0, int y0, + int bottom, int right, int y, v128 *o, v128 *r, + v128 *a, v128 *b, v128 *c, v128 *d, v128 *e, + v128 *f) { + const v64 k1 = v64_load_aligned(org); + const v64 k2 = v64_load_aligned(org + ostride); + const v64 l1 = v64_load_aligned(rec); + const v64 l2 = v64_load_aligned(rec + rstride); + *o = v128_from_v64(k1, k2); + *r = v128_from_v64(l1, l2); + *a = v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1); 
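+  // (y != -y0) is zero only for the frame's top line, and (y != bottom)
+  // below is zero only for its lowest line pair, so the frame's edge rows
+  // are duplicated into a and f instead of reading outside the frame.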
+ *f = v128_from_v64(l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)); + *b = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0), + v64_load_unaligned(rec - 2 * !!x0 + rstride)); + *c = v128_from_v64(v64_load_unaligned(rec - !!x0), + v64_load_unaligned(rec - !!x0 + rstride)); + *d = v128_from_v64(v64_load_unaligned(rec + !!right), + v64_load_unaligned(rec + !!right + rstride)); + *e = v128_from_v64(v64_load_unaligned(rec + 2 * !!right), + v64_load_unaligned(rec + 2 * !!right + rstride)); + clip_sides(b, c, d, e, x0, right); } void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, - unsigned int strength) { - ssd128_internal ssd0 = v128_ssd_u8_init(); - ssd128_internal ssd1 = v128_ssd_u8_init(); - const v128 c128 = v128_dup_8(128); + unsigned int strength, int size) { const v128 sp = v128_dup_8(strength); const v128 sm = v128_dup_8(-(int)strength); + const int right = width - 8 - x0; const int bottom = height - 2 - y0; + ssd128_internal ssd0 = v128_ssd_u8_init(); + ssd128_internal ssd1 = v128_ssd_u8_init(); + int y; + + if (size != 8) { // Fallback to plain C + aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0, + sum1, strength, size); + return; + } rec += x0 + y0 * rstride; org += x0 + y0 * ostride; - if (!x0) { // Clip left - const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL), - v64_from_64(0x0504030201000000LL)); - const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL), - v64_from_64(0x0605040302010000LL)); - int y; - - for (y = 0; y < 8; y += 2) { - const v64 k1 = v64_load_aligned(org); - const v64 k2 = v64_load_aligned(org + ostride); - const v64 l1 = v64_load_aligned(rec); - const v64 l2 = v64_load_aligned(rec + rstride); - v128 o = v128_from_v64(k1, k2); - const v128 q = v128_from_v64(l1, l2); - const v128 x = v128_add_8(c128, q); - const v128 a = v128_add_8( - c128, - v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); - const v128 b = v128_shuffle_8(x, b_shuff); - const v128 c = v128_shuffle_8(x, c_shuff); - const v128 d = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(rec + 1), - v64_load_unaligned(rec + 1 + rstride))); - const v128 e = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(rec + 2), - v64_load_unaligned(rec + 2 + rstride))); - const v128 f = v128_add_8( - c128, v128_from_v64( - l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride))); - ssd0 = v128_ssd_u8(ssd0, o, q); - ssd1 = v128_ssd_u8( - ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm))); - rec += rstride * 2; - org += ostride * 2; - } - } else if (!(width - x0 - 8)) { // Clip right - const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), - v64_from_64(0x0707060504030201LL)); - const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL), - v64_from_64(0x0707070605040302LL)); - int y; - - for (y = 0; y < 8; y += 2) { - const v64 k1 = v64_load_aligned(org); - const v64 k2 = v64_load_aligned(org + ostride); - const v64 l1 = v64_load_aligned(rec); - const v64 l2 = v64_load_aligned(rec + rstride); - v128 o = v128_from_v64(k1, k2); - const v128 q = v128_from_v64(l1, l2); - const v128 x = v128_add_8(c128, q); - const v128 a = v128_add_8( - c128, - v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1)); - const v128 b = v128_add_8( - c128, v128_from_v64(v64_load_unaligned(rec - 2), - v64_load_unaligned(rec - 2 + rstride))); - const v128 c = v128_add_8( - c128, 
v128_from_v64(v64_load_unaligned(rec - 1),
-                              v64_load_unaligned(rec - 1 + rstride)));
-      const v128 d = v128_shuffle_8(x, d_shuff);
-      const v128 e = v128_shuffle_8(x, e_shuff);
-      const v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
-  } else {  // No left/right clipping
-    int y;
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      const v128 b = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec - 2),
-                              v64_load_unaligned(rec - 2 + rstride)));
-      const v128 c = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec - 1),
-                              v64_load_unaligned(rec - 1 + rstride)));
-      const v128 d = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec + 1),
-                              v64_load_unaligned(rec + 1 + rstride)));
-      const v128 e = v128_add_8(
-          c128, v128_from_v64(v64_load_unaligned(rec + 2),
-                              v64_load_unaligned(rec + 2 + rstride)));
-      const v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
+                   &a, &b, &c, &d, &e, &f);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
+    rec += rstride * 2;
+    org += ostride * 2;
   }
   *sum0 += v128_ssd_u8_sum(ssd0);
   *sum1 += v128_ssd_u8_sum(ssd1);
 }
 
-SIMD_INLINE void calc_delta_multi(v128 x, v128 q, v128 o, v128 a, v128 b,
-                                  v128 c, v128 d, v128 e, v128 f, v128 cp1,
-                                  v128 cm1, v128 cp2, v128 cm2, v128 cp4,
-                                  v128 cm4, ssd128_internal *ssd1,
+SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c,
+                                  v128 d, v128 e, v128 f, ssd128_internal *ssd1,
                                   ssd128_internal *ssd2,
                                   ssd128_internal *ssd3) {
-  v128 tmp, delta1, delta2, delta3;
-  const v128 c8 = v128_dup_8(8);
-
-  a = v128_ssub_s8(a, x);
-  b = v128_ssub_s8(b, x);
-  c = v128_ssub_s8(c, x);
-  d = v128_ssub_s8(d, x);
-  e = v128_ssub_s8(e, x);
-  f = v128_ssub_s8(f, x);
-  tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp1), cm1),
-                   v128_max_s8(v128_min_s8(d, cp1), cm1));
-  delta1 = v128_add_8(
-      v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp1), cm1),
-                                       v128_max_s8(v128_min_s8(f, cp1), cm1)),
-                            2),
-                 v128_add_8(v128_max_s8(v128_min_s8(b, cp1), cm1),
-                            v128_max_s8(v128_min_s8(e, cp1), cm1))),
-      v128_add_8(v128_add_8(tmp, tmp), tmp));
-  tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp2), cm2),
-                   v128_max_s8(v128_min_s8(d, cp2), cm2));
-  delta2 = v128_add_8(
-      v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp2), cm2),
-                                       v128_max_s8(v128_min_s8(f, cp2), cm2)),
-                            2),
-                 v128_add_8(v128_max_s8(v128_min_s8(b, cp2), cm2),
-                            v128_max_s8(v128_min_s8(e, cp2), cm2))),
-      v128_add_8(v128_add_8(tmp, tmp), tmp));
-  tmp = v128_add_8(v128_max_s8(v128_min_s8(c, cp4), cm4),
-                   v128_max_s8(v128_min_s8(d, cp4), cm4));
-  delta3 = v128_add_8(
-      v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, cp4), cm4),
-                                       v128_max_s8(v128_min_s8(f, cp4), cm4)),
-                            2),
-                 v128_add_8(v128_max_s8(v128_min_s8(b, cp4), cm4),
-                            v128_max_s8(v128_min_s8(e, cp4), cm4))),
-      v128_add_8(v128_add_8(tmp, tmp), tmp));
-
-  *ssd1 = v128_ssd_u8(
-      *ssd1, o,
-      v128_add_8(
-          q, v128_shr_s8(
-                 v128_add_8(c8, v128_add_8(delta1,
-                                           v128_cmplt_s8(delta1, v128_zero()))),
-                 4)));
-  *ssd2 = v128_ssd_u8(
-      *ssd2, o,
-      v128_add_8(
-          q, v128_shr_s8(
-                 v128_add_8(c8, v128_add_8(delta2,
-                                           v128_cmplt_s8(delta2, v128_zero()))),
-                 4)));
-  *ssd3 = v128_ssd_u8(
-      *ssd3, o,
-      v128_add_8(
-          q, v128_shr_s8(
-                 v128_add_8(c8, v128_add_8(delta3,
-                                           v128_cmplt_s8(delta3, v128_zero()))),
-                 4)));
+  calc_diff(r, &a, &b, &c, &d, &e, &f);
+  *ssd1 = v128_ssd_u8(*ssd1, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(1),
+                                             v128_dup_8(-1)));
+  *ssd2 = v128_ssd_u8(*ssd2, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(2),
+                                             v128_dup_8(-2)));
+  *ssd3 = v128_ssd_u8(*ssd3, o, delta_kernel(r, a, b, c, d, e, f, v128_dup_8(4),
+                                             v128_dup_8(-4)));
 }
 
 // Test multiple filter strengths at once.
 void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
                                       int rstride, int ostride, int x0, int y0,
-                                      int width, int height, int *sum) {
-  const v128 c128 = v128_dup_8(128);
-  const v128 cp1 = v128_dup_8(1);
-  const v128 cm1 = v128_dup_8(-1);
-  const v128 cp2 = v128_dup_8(2);
-  const v128 cm2 = v128_dup_8(-2);
-  const v128 cp4 = v128_dup_8(4);
-  const v128 cm4 = v128_dup_8(-4);
+                                      int width, int height, int *sum,
+                                      int size) {
   const int bottom = height - 2 - y0;
+  const int right = width - 8 - x0;
   ssd128_internal ssd0 = v128_ssd_u8_init();
   ssd128_internal ssd1 = v128_ssd_u8_init();
   ssd128_internal ssd2 = v128_ssd_u8_init();
   ssd128_internal ssd3 = v128_ssd_u8_init();
+  int y;
+
+  if (size != 8) {  // Fallback to plain C
+    aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height,
+                            sum, size);
+    return;
+  }
 
   rec += x0 + y0 * rstride;
   org += x0 + y0 * ostride;
 
-  if (!x0) {  // Clip left
-    const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
-                                       v64_from_64(0x0504030201000000LL));
-    const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
-                                       v64_from_64(0x0605040302010000LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      v128 b = v128_shuffle_8(x, b_shuff);
-      v128 c = v128_shuffle_8(x, c_shuff);
-      v128 d = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 1),
-                                        v64_load_unaligned(rec + 1 + rstride)));
-      v128 e = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 2),
-                                        v64_load_unaligned(rec + 2 + rstride)));
-      v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  } else if (!(width - x0 - 8)) {  // Clip right
-    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
-                                       v64_from_64(0x0707060504030201LL));
-    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
-                                       v64_from_64(0x0707070605040302LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      v128 b = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 2),
-                                        v64_load_unaligned(rec - 2 + rstride)));
-      v128 c = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 1),
-                                        v64_load_unaligned(rec - 1 + rstride)));
-      v128 d = v128_shuffle_8(x, d_shuff);
-      v128 e = v128_shuffle_8(x, e_shuff);
-      v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  } else {  // No left/right clipping
-    int y;
-    for (y = 0; y < 8; y += 2) {
-      const v64 k1 = v64_load_aligned(org);
-      const v64 k2 = v64_load_aligned(org + ostride);
-      const v64 l1 = v64_load_aligned(rec);
-      const v64 l2 = v64_load_aligned(rec + rstride);
-      v128 o = v128_from_v64(k1, k2);
-      const v128 q = v128_from_v64(l1, l2);
-      const v128 x = v128_add_8(c128, q);
-      v128 a = v128_add_8(
-          c128,
-          v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1));
-      v128 b = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 2),
-                                        v64_load_unaligned(rec - 2 + rstride)));
-      v128 c = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec - 1),
-                                        v64_load_unaligned(rec - 1 + rstride)));
-      v128 d = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 1),
-                                        v64_load_unaligned(rec + 1 + rstride)));
-      v128 e = v128_add_8(c128,
-                          v128_from_v64(v64_load_unaligned(rec + 2),
-                                        v64_load_unaligned(rec + 2 + rstride)));
-      v128 f = v128_add_8(
-          c128, v128_from_v64(
-                    l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
+                   &a, &b, &c, &d, &e, &f);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
+    rec += 2 * rstride;
+    org += 2 * ostride;
   }
   sum[0] += v128_ssd_u8_sum(ssd0);
   sum[1] += v128_ssd_u8_sum(ssd1);
@@ -353,154 +182,66 @@ void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
 }
 
 #if CONFIG_AOM_HIGHBITDEPTH
+SIMD_INLINE void read_two_lines_hbd(const uint16_t *rec, const uint16_t *org,
+                                    int rstride, int ostride, int x0, int y0,
+                                    int bottom, int right, int y, v128 *o,
+                                    v128 *r, v128 *a, v128 *b, v128 *c, v128 *d,
+                                    v128 *e, v128 *f, int shift) {
+  const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
+  const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
+  *o = v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
+                      v128_shr_u16(v128_load_aligned(org + ostride), shift));
+  *r = v128_unziplo_8(n1, n2);
+  *a = v128_unziplo_8(
+      v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), shift), n1);
+  *f = v128_unziplo_8(
+      n2, v128_shr_u16(v128_load_unaligned(rec + ((y != bottom) + 1) * rstride),
+                       shift));
+  *b = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0), shift),
+      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0 + rstride), shift));
+  *c = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec - !!x0), shift),
+      v128_shr_u16(v128_load_unaligned(rec - !!x0 + rstride), shift));
+  *d = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec + !!right), shift),
+      v128_shr_u16(v128_load_unaligned(rec + !!right + rstride), shift));
+  *e = v128_unziplo_8(
+      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right), shift),
+      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right + rstride), shift));
+  clip_sides(b, c, d, e, x0, right);
+}
+
 void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org,
                                     int rstride, int ostride, int x0, int y0,
                                     int width, int height, int *sum0, int *sum1,
-                                    unsigned int strength, int shift) {
-  ssd128_internal ssd0 = v128_ssd_u8_init();
-  ssd128_internal ssd1 = v128_ssd_u8_init();
-  const v128 c128 = v128_dup_8(128);
+                                    unsigned int strength, int shift,
+                                    int size) {
   const v128 sp = v128_dup_8(strength >> shift);
   const v128 sm = v128_dup_8(-(int)(strength >> shift));
   const int bottom = height - 2 - y0;
+  const int right = width - 8 - x0;
+  ssd128_internal ssd0 = v128_ssd_u8_init();
+  ssd128_internal ssd1 = v128_ssd_u8_init();
+  int y;
+
+  if (size != 8) {  // Fallback to plain C
+    aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height,
+                          sum0, sum1, strength, shift, size);
+    return;
+  }
 
   rec += x0 + y0 * rstride;
   org += x0 + y0 * ostride;
 
-  if (!x0) {  // Clip left
-    const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
-                                       v64_from_64(0x0504030201000000LL));
-    const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
-                                       v64_from_64(0x0605040302010000LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_shuffle_8(x, b_shuff);
-      const v128 c = v128_shuffle_8(x, c_shuff);
-      const v128 d = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
-      const v128 e = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
-  } else if (!(width - x0 - 8)) {  // Clip right
-    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
-                                       v64_from_64(0x0707060504030201LL));
-    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
-                                       v64_from_64(0x0707070605040302LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
-      const v128 c = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
-      const v128 d = v128_shuffle_8(x, d_shuff);
-      const v128 e = v128_shuffle_8(x, e_shuff);
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
-  } else {  // No left/right clipping
-    int y;
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
-      const v128 c = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
-      const v128 d = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
-      const v128 e = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      ssd1 = v128_ssd_u8(
-          ssd1, o, v128_add_8(q, calc_delta(x, a, b, c, d, e, f, sp, sm)));
-      rec += rstride * 2;
-      org += ostride * 2;
-    }
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
+                       &r, &a, &b, &c, &d, &e, &f, shift);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, sp, sm));
+    rec += rstride * 2;
+    org += ostride * 2;
   }
   *sum0 += v128_ssd_u8_sum(ssd0);
   *sum1 += v128_ssd_u8_sum(ssd1);
@@ -510,158 +251,32 @@ void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec,
                                           const uint16_t *org, int rstride,
                                           int ostride, int x0, int y0,
                                           int width, int height, int *sum,
-                                          int shift) {
-  const v128 c128 = v128_dup_8(128);
-  const v128 cp1 = v128_dup_8(1);
-  const v128 cm1 = v128_dup_8(-1);
-  const v128 cp2 = v128_dup_8(2);
-  const v128 cm2 = v128_dup_8(-2);
-  const v128 cp4 = v128_dup_8(4);
-  const v128 cm4 = v128_dup_8(-4);
+                                          int shift, int size) {
   const int bottom = height - 2 - y0;
+  const int right = width - 8 - x0;
   ssd128_internal ssd0 = v128_ssd_u8_init();
   ssd128_internal ssd1 = v128_ssd_u8_init();
   ssd128_internal ssd2 = v128_ssd_u8_init();
   ssd128_internal ssd3 = v128_ssd_u8_init();
+  int y;
+
+  if (size != 8) {  // Fallback to plain C
+    aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width,
+                                height, sum, shift, size);
+    return;
+  }
 
   rec += x0 + y0 * rstride;
   org += x0 + y0 * ostride;
 
-  if (!x0) {  // Clip left
-    const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
-                                       v64_from_64(0x0504030201000000LL));
-    const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
-                                       v64_from_64(0x0605040302010000LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_shuffle_8(x, b_shuff);
-      const v128 c = v128_shuffle_8(x, c_shuff);
-      const v128 d = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
-      const v128 e = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  } else if (!(width - x0 - 8)) {  // Clip right
-    const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
-                                       v64_from_64(0x0707060504030201LL));
-    const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
-                                       v64_from_64(0x0707070605040302LL));
-    int y;
-
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
-      const v128 c = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
-      const v128 d = v128_shuffle_8(x, d_shuff);
-      const v128 e = v128_shuffle_8(x, e_shuff);
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
-  } else {  // No left/right clipping
-    int y;
-    for (y = 0; y < 8; y += 2) {
-      const v128 n1 = v128_shr_u16(v128_load_aligned(rec), shift);
-      const v128 n2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
-      const v128 o =
-          v128_unziplo_8(v128_shr_u16(v128_load_aligned(org), shift),
-                         v128_shr_u16(v128_load_aligned(org + ostride), shift));
-      const v128 q = v128_unziplo_8(n1, n2);
-      const v128 x = v128_add_8(c128, q);
-      const v128 a = v128_add_8(
-          c128, v128_unziplo_8(
-                    v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride),
-                                 shift),
-                    n1));
-      const v128 b = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 2 + rstride), shift)));
-      const v128 c = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec - 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec - 1 + rstride), shift)));
-      const v128 d = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 1), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 1 + rstride), shift)));
-      const v128 e = v128_add_8(
-          c128,
-          v128_unziplo_8(
-              v128_shr_u16(v128_load_unaligned(rec + 2), shift),
-              v128_shr_u16(v128_load_unaligned(rec + 2 + rstride), shift)));
-      const v128 f = v128_add_8(
-          c128, v128_unziplo_8(
-                    n2, v128_shr_u16(v128_load_unaligned(
-                                         rec + ((y != bottom) + 1) * rstride),
-                                     shift)));
-
-      ssd0 = v128_ssd_u8(ssd0, o, q);
-      calc_delta_multi(x, q, o, a, b, c, d, e, f, cp1, cm1, cp2, cm2, cp4, cm4,
-                       &ssd1, &ssd2, &ssd3);
-      rec += 2 * rstride;
-      org += 2 * ostride;
-    }
+  for (y = 0; y < 8; y += 2) {
+    v128 a, b, c, d, e, f, o, r;
+    read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
+                       &r, &a, &b, &c, &d, &e, &f, shift);
+    ssd0 = v128_ssd_u8(ssd0, o, r);
+    calc_delta_multi(r, o, a, b, c, d, e, f, &ssd1, &ssd2, &ssd3);
+    rec += 2 * rstride;
+    org += 2 * ostride;
  }
   sum[0] += v128_ssd_u8_sum(ssd0);
   sum[1] += v128_ssd_u8_sum(ssd1);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 0706090167..6c20ea02bf 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -17,6 +17,7 @@
 #include "av1/common/alloccommon.h"
 #if CONFIG_CLPF
+#include "aom/aom_image.h"
 #include "av1/common/clpf.h"
 #include "av1/encoder/clpf_rdo.h"
 #endif
@@ -2620,7 +2621,7 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
   }
 
 #if CONFIG_CLPF
-  cm->clpf_strength = 0;
+  cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
   cm->clpf_size = 2;
   CHECK_MEM_ERROR(
       cm, cm->clpf_blocks,
@@ -2628,21 +2629,37 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
       ((cm->frame_to_show->y_crop_height + 31) & ~31) >> 10));
 
   if (!is_lossless_requested(&cpi->oxcf)) {
+    const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show;
+
     // Find the best strength and block size for the entire frame
-    int fb_size_log2, strength;
-    av1_clpf_test_frame(cm->frame_to_show, cpi->Source, cm, &strength,
-                        &fb_size_log2);
+    int fb_size_log2, strength_y, strength_u, strength_v;
+    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_y, &fb_size_log2,
+                        AOM_PLANE_Y);
+    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_u, &fb_size_log2,
+                        AOM_PLANE_U);
+    av1_clpf_test_frame(frame, cpi->Source, cm, &strength_v, &fb_size_log2,
+                        AOM_PLANE_V);
 
     if (!fb_size_log2) fb_size_log2 = get_msb(MAX_FB_SIZE);
 
-    if (strength) {
+    if (strength_y) {
       // Apply the filter using the chosen strength
-      cm->clpf_strength = strength - (strength == 4);
+      cm->clpf_strength_y = strength_y - (strength_y == 4);
       cm->clpf_size = fb_size_log2 ?
fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0; cm->clpf_numblocks = av1_clpf_frame( - cm->frame_to_show, cpi->Source, cm, !!cm->clpf_size, strength, - 4 + cm->clpf_size, cm->clpf_blocks, av1_clpf_decision); + frame, cpi->Source, cm, !!cm->clpf_size, strength_y, + 4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, av1_clpf_decision); + } + if (strength_u) { + cm->clpf_strength_u = strength_u - (strength_u == 4); + av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, NULL, AOM_PLANE_U, + NULL); + } + if (strength_v) { + cm->clpf_strength_v = strength_v - (strength_v == 4); + av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, NULL, AOM_PLANE_V, + NULL); } } #endif diff --git a/test/clpf_test.cc b/test/clpf_test.cc index 74a41a8b9e..df5c5ac497 100644 --- a/test/clpf_test.cc +++ b/test/clpf_test.cc @@ -147,6 +147,8 @@ void test_clpf(int w, int h, int depth, int iterations, << "strength: " << (1 << strength) << std::endl << "xpos: " << xpos << std::endl << "ypos: " << ypos << std::endl + << "w: " << w << std::endl + << "h: " << h << std::endl << "A=" << (pos > size ? (int16_t)s[pos - size] : -1) << std::endl << "B=" << (pos % size - 2 >= 0 ? (int16_t)s[pos - 2] : -1) << std::endl << "C=" << (pos % size - 1 >= 0 ? (int16_t)s[pos - 1] : -1) << std::endl -- GitLab
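
For reference when reading the vector code in this patch: every detect/filter
path above scores the same per-sample CLPF kernel, where A is the pixel above
the centre pixel X, F the pixel below, B/C the two pixels to its left, D/E the
two to its right, and b the clipping strength. A minimal scalar sketch of that
kernel, mirroring the reference filter in av1/common/clpf.c (the helper names
clpf_sample and clamp_int below are illustrative, not identifiers from this
patch):

    static int clamp_int(int x, int lo, int hi) {
      return x < lo ? lo : (x > hi ? hi : x);
    }

    /* X: centre pixel; A above, F below; B,C left; D,E right; b: strength. */
    static int clpf_sample(int X, int A, int B, int C, int D, int E, int F,
                           int b) {
      int delta = 4 * clamp_int(A - X, -b, b) + clamp_int(B - X, -b, b) +
                  3 * clamp_int(C - X, -b, b) + 3 * clamp_int(D - X, -b, b) +
                  clamp_int(E - X, -b, b) + 4 * clamp_int(F - X, -b, b);
      /* Round to nearest, ties away from zero; this matches the
         (8 + delta - (delta < 0)) >> 4 sequence in the SIMD code. */
      return X + ((8 + delta - (delta < 0)) >> 4);
    }

The shift-left-by-2 terms in the SIMD delta computation correspond to the
4x weights on A and F, and the tmp + tmp + tmp sums to the 3x weights on C
and D; the detect_multi routines accumulate SSD against the original frame
for strengths 1, 2 and 4 in a single pass so the encoder can pick the best
strength per plane.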