Commit ecf9a0c8 authored by Steinar Midtskogen, committed by Yaowu Xu

Extend CLPF to chroma.

Objective quality impact (low latency):

PSNR YCbCr:      0.13%     -1.37%     -1.79%
   PSNRHVS:      0.03%
      SSIM:      0.24%
    MSSSIM:      0.10%
 CIEDE2000:     -0.83%

Change-Id: I8ddf0def569286775f0f9d4d4005932766a7fc27
parent 9021d09f
@@ -590,16 +590,16 @@ if (aom_config("CONFIG_CLPF") eq "yes") {
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift";
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift, int size";
specialize qw/aom_clpf_detect_hbd sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift";
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift, int size";
specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
}
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength";
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size";
specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum";
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size";
specialize qw/aom_clpf_detect_multi sse2 ssse3 sse4_1 neon/;
}
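
The detect prototypes gain a trailing int size argument so the filter decision can evaluate 4x4 chroma blocks as well as 8x8 luma blocks. A minimal caller sketch, assuming 4x4 blocks for subsampled chroma planes (the actual call sites are in the encoder's CLPF search, outside this diff):

#include "./aom_dsp_rtcd.h"
#include "aom/aom_image.h"

/* Sketch only: forward a per-plane block size to the detector. */
static void detect_block(const uint8_t *rec, const uint8_t *org, int rstride,
                         int ostride, int x0, int y0, int width, int height,
                         int *sum0, int *sum1, unsigned int strength,
                         int plane) {
  const int size = plane == AOM_PLANE_Y ? 8 : 4; /* assumed 4:2:0 mapping */
  aom_clpf_detect(rec, org, rstride, ostride, x0, y0, width, height, sum0,
                  sum1, strength, size);
}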
@@ -8,9 +8,10 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "av1/common/clpf.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h"
int av1_clpf_maxbits(const AV1_COMMON *cm) {
@@ -72,21 +73,24 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
#endif
// Return number of filtered blocks
int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *,
const AV1_COMMON *cm, int, int, int,
unsigned int, unsigned int, uint8_t *)) {
int av1_clpf_frame(
const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org,
AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks, int plane,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int,
int, unsigned int, unsigned int, uint8_t *, int)) {
/* Constrained low-pass filter (CLPF) */
int c, k, l, m, n;
const int bs = MI_SIZE;
const int width = frame->y_crop_width;
const int height = frame->y_crop_height;
const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
const int suby = plane != AOM_PLANE_Y && frame->subsampling_y;
const int bs = (subx || suby) ? 4 : 8;
const int bslog = get_msb(bs);
int width = plane != AOM_PLANE_Y ? frame->uv_crop_width : frame->y_crop_width;
int height =
plane != AOM_PLANE_Y ? frame->uv_crop_height : frame->y_crop_height;
int xpos, ypos;
const int sstride = frame->y_stride;
const int sstride = plane != AOM_PLANE_Y ? frame->uv_stride : frame->y_stride;
int dstride = bs;
const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
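
For subsampled planes an 8x8 luma area covers only 4x4 chroma samples, so the filter block size bs drops from 8 to 4 and divisions by bs become shifts by bslog (get_msb(4) == 2, get_msb(8) == 3, hence the aom_dsp_common.h include). A self-contained sketch of the geometry above, assuming 4:2:0 subsampling:

#include "aom/aom_image.h"

/* Sketch mirroring the bs computation above: with 4:2:0 chroma
 * (subsampling_x == subsampling_y == 1) the chroma planes are filtered
 * in 4x4 blocks, e.g. 960x540 chroma for a 1920x1080 luma plane. */
static int clpf_plane_bs(int plane, int subsampling_x, int subsampling_y) {
  const int subx = plane != AOM_PLANE_Y && subsampling_x;
  const int suby = plane != AOM_PLANE_Y && subsampling_y;
  return (subx || suby) ? 4 : 8;
}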
@@ -97,9 +101,11 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
int cache_idx = 0;
const int cache_size = num_fb_hor << (2 * fb_size_log2);
const int cache_blocks = cache_size / (bs * bs);
YV12_BUFFER_CONFIG dst = *frame;
assert(bs == 8); // Optimised code assumes this.
uint8_t *src_buffer =
plane != AOM_PLANE_Y
? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer)
: frame->y_buffer;
uint8_t *dst_buffer;
#if CONFIG_AOM_HIGHBITDEPTH
strength <<= (cm->bit_depth - 8);
@@ -108,10 +114,10 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
// Make buffer space for in-place filtering
#if CONFIG_AOM_HIGHBITDEPTH
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size << !!cm->use_highbitdepth));
dst.y_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache;
dst_buffer = cm->use_highbitdepth ? CONVERT_TO_BYTEPTR(cache) : cache;
#else
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
dst.y_buffer = cache;
dst_buffer = cache;
#endif
CHECK_MEM_ERROR(cm, cache_ptr, aom_malloc(cache_blocks * sizeof(*cache_ptr)));
CHECK_MEM_ERROR(cm, cache_dst, aom_malloc(cache_blocks * sizeof(*cache_dst)));
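
Since filtering is done in place, each filtered block is first written into a slot of cache; when the ring of cache_blocks slots wraps around, the evicted block is copied back to the frame position remembered in cache_dst. A plain-C sketch of that flush, assuming 8-bit data (the code below uses 64-bit stores for 8-wide rows and 32-bit stores for the new 4-wide chroma rows):

#include <stdint.h>
#include <string.h>

/* Sketch: copy one bs x bs cached block back into the frame. */
static void flush_cached_block(uint8_t *frame_pos, int sstride,
                               const uint8_t *cache_slot, int bs) {
  int c;
  for (c = 0; c < bs; c++)
    memcpy(frame_pos + c * sstride, cache_slot + c * bs, bs);
}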
@@ -130,7 +136,8 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
ypos = yoff + m * bs;
if (xpos < width && ypos < height) {
allskip &=
cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
->mbmi.skip;
}
}
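
The mode-info grid is indexed in luma units of MI_SIZE pixels (8 at the time of this commit), so a chroma pixel position must be scaled back to luma coordinates by the subsampling shift before indexing; that is what the (ypos << suby) / MI_SIZE expressions do. As a hedged helper sketch:

/* Sketch: plane-local pixel position -> mode-info grid index (luma units). */
static int mi_grid_index(int xpos, int ypos, int subx, int suby,
                         int mi_stride) {
  return (ypos << suby) / MI_SIZE * mi_stride + (xpos << subx) / MI_SIZE;
}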
@@ -144,13 +151,14 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
if (!allskip && // Do not filter the block if all is skip encoded
(!enable_fb_flag ||
decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
fb_size_log2, blocks + block_index))) {
fb_size_log2, blocks + block_index, plane))) {
// Iterate over all smaller blocks inside the filter block
for (m = 0; m < (h + bs - 1) / bs; m++) {
for (n = 0; n < (w + bs - 1) / bs; n++) {
for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
xpos = xoff + n * bs;
ypos = yoff + m * bs;
if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
(xpos << subx) / MI_SIZE]
->mbmi.skip) { // Not skip block
// Temporary buffering needed if filtering in-place
if (cache_ptr[cache_idx]) {
@@ -161,50 +169,59 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
for (c = 0; c < bs; c++) {
*(uint64_t *)(d + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
*(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
if (bs == 8)
*(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
}
} else {
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
if (bs == 8)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
}
#else
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
if (bs == 8)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
#endif
}
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
cache_ptr[cache_idx] = cache + cache_idx * bs * bs * 2;
dst.y_buffer =
dst_buffer =
CONVERT_TO_BYTEPTR(cache_ptr[cache_idx]) - ypos * bs - xpos;
} else {
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
}
#else
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
dst_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
#endif
cache_dst[cache_idx] = frame->y_buffer + ypos * sstride + xpos;
cache_dst[cache_idx] = src_buffer + ypos * sstride + xpos;
if (++cache_idx >= cache_blocks) cache_idx = 0;
// Apply the filter
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(frame->y_buffer),
CONVERT_TO_SHORTPTR(dst.y_buffer), sstride,
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
CONVERT_TO_SHORTPTR(dst_buffer), sstride,
dstride, xpos, ypos, bs, bs, width, height,
strength);
} else {
aom_clpf_block(frame->y_buffer, dst.y_buffer, sstride, dstride,
xpos, ypos, bs, bs, width, height, strength);
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
ypos, bs, bs, width, height, strength);
}
#else
aom_clpf_block(frame->y_buffer, dst.y_buffer, sstride, dstride,
xpos, ypos, bs, bs, width, height, strength);
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
ypos, bs, bs, width, height, strength);
#endif
}
}
@@ -223,18 +240,27 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
for (c = 0; c < bs; c++) {
*(uint64_t *)(d + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
*(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
if (bs == 8)
*(uint64_t *)(d + c * sstride + 4) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
}
} else {
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
if (bs == 4)
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
}
#else
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
if (bs == 4)
*(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint32_t *)(cache_ptr[cache_idx] + c * bs);
else
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
#endif
}
@@ -20,10 +20,10 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks,
unsigned int fb_size_log2, uint8_t *blocks, int plane,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *,
const AV1_COMMON *cm, int, int, int,
unsigned int, unsigned int, uint8_t *));
unsigned int, unsigned int, uint8_t *, int));
#endif
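
With the extra plane argument a caller now invokes av1_clpf_frame once per plane. A hedged sketch of a decoder-style invocation (the real call sites are in the encoder and decoder loop-filter paths, outside this diff; with enable_fb_flag == 0 the decision callback is never invoked):

/* Sketch only: filter all three planes with one strength. */
static void clpf_all_planes(const YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                            unsigned int strength, unsigned int fb_size_log2,
                            uint8_t *blocks) {
  int plane;
  for (plane = AOM_PLANE_Y; plane <= AOM_PLANE_V; plane++)
    av1_clpf_frame(frame, 0, cm, 0 /* no per-fb signalling */, strength,
                   fb_size_log2, blocks, plane, 0 /* decision unused */);
}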
@@ -10,131 +10,165 @@
*/
#include "./aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
SIMD_INLINE void calc_delta(v128 o, v128 x, v128 a, v128 b, v128 c, v128 d,
v128 e, v128 f, uint8_t *dst, v128 sp, v128 sm,
int dstride) {
// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
// 3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
// 1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
// The difference will be 9 bit, offset by 128 so we can use saturated
// sub to avoid going to 16 bit temporarily before "strength" clipping.
const v128 c128 = v128_dup_8(128);
const v128 x = v128_add_8(c128, o);
const v128 c8 = v128_dup_8(8);
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(d, x), sp), sm));
const v128 tmp = v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, c), x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, d), x), sp), sm));
const v128 delta = v128_add_8(
v128_add_8(
v128_shl_8(
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(a, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(f, x), sp), sm)),
v128_add_8(
v128_max_s8(
v128_min_s8(v128_ssub_s8(v128_add_8(c128, a), x), sp),
sm),
v128_max_s8(
v128_min_s8(v128_ssub_s8(v128_add_8(c128, f), x), sp),
sm)),
2),
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(b, x), sp), sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(e, x), sp), sm))),
v128_add_8(
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, b), x), sp),
sm),
v128_max_s8(v128_min_s8(v128_ssub_s8(v128_add_8(c128, e), x), sp),
sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
o = v128_add_8(
return v128_add_8(
o,
v128_shr_s8(
v128_add_8(c8, v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
4));
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
}
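
calc_delta now returns the filtered vector instead of storing it, so each block-width variant can do its own store. Per sample it evaluates the kernel from the comment above; a scalar reference sketch with the same clamping and rounding (av1_clpf_sample in clpf.c is the canonical scalar implementation):

static int clamp_sample(int x, int s) { return x < -s ? -s : x > s ? s : x; }

/* Scalar sketch: X is the centre sample, A/F the vertical neighbours,
 * B/E the two-pixel and C/D the one-pixel horizontal neighbours. */
static int clpf_sample_sketch(int X, int A, int B, int C, int D, int E, int F,
                              int s) {
  const int delta = 4 * clamp_sample(A - X, s) + clamp_sample(B - X, s) +
                    3 * clamp_sample(C - X, s) + 3 * clamp_sample(D - X, s) +
                    clamp_sample(E - X, s) + 4 * clamp_sample(F - X, s);
  /* Round delta/16 symmetrically: bias by 8, minus 1 for negative deltas. */
  return X + ((8 + delta - (delta < 0)) >> 4);
}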
static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int width,
int height, unsigned int strength) {
int bottom = height - 2 - y0;
// Process blocks of width 8, two lines at a time, 8 bit.
static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int width,
int height, unsigned int strength) {
const int bottom = height - 2 - y0;
const int right = width - 8 - x0;
const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength);
const v128 c128 = v128_dup_8(128);
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
int y;
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
if (!x0) { // Clip left
const v128 b_shuff = v128_from_v64(v64_from_64(0x0d0c0b0a09080808LL),
v64_from_64(0x0504030201000000LL));
const v128 c_shuff = v128_from_v64(v64_from_64(0x0e0d0c0b0a090808LL),
v64_from_64(0x0605040302010000LL));
int y;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 a =
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1);
v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
v64_load_unaligned(src - 2 * !!x0 + sstride));
v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
v64_load_unaligned(src - !!x0 + sstride));
v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
v64_load_unaligned(src + !!right + sstride));
v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
v64_load_unaligned(src + 2 * !!right + sstride));
const v128 f = v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_shuffle_8(x, b_shuff);
const v128 c = v128_shuffle_8(x, c_shuff);
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
if (!x0) { // Left clipping
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
}
if (!right) { // Right clipping
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
}
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
v64_from_64(0x0707060504030201LL));
const v128 e_shuff = v128_from_v64(v64_from_64(0x0f0f0f0e0d0c0b0aLL),
v64_from_64(0x0707070605040302LL));
int y;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_shuffle_8(x, d_shuff);
const v128 e = v128_shuffle_8(x, e_shuff);
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
src += sstride * 2;
dst += dstride * 2;
}
}
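
At the picture edges the horizontal neighbours b, c (left) and d, e (right) would otherwise read outside the plane: the !!x0 and !!right factors zero the load offsets there, and the shuffle tables replicate the edge bytes, which is equivalent to clamping the neighbour column. A scalar sketch of that border rule:

/* Sketch: clamp a neighbour column to the picture, as the b/c/d/e
 * shuffles above do at the left and right block edges. */
static int clamp_column(int x, int width) {
  return x < 0 ? 0 : x > width - 1 ? width - 1 : x;
}
/* e.g. b = row[clamp_column(x - 2, width)], e = row[clamp_column(x + 2, width)] */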
// Process blocks of width 4, four lines at a time, 8 bit.
static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int width,
int height, unsigned int strength) {
const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength);
const int right = width - 4 - x0;
const int bottom = height - 4 - y0;
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0504040401000000LL, 0x0d0c0c0c09080808LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0605040402010000LL, 0x0e0d0c0c0a090808LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0707060503030201LL, 0x0f0f0e0d0b0b0a09LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0707070603030302LL, 0x0f0f0f0e0b0b0b0aLL };
int y;
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
for (y = 0; y < sizey; y += 4) {
const uint32_t l0 = u32_load_aligned(src - (y != -y0) * sstride);
const uint32_t l1 = u32_load_aligned(src);
const uint32_t l2 = u32_load_aligned(src + sstride);
const uint32_t l3 = u32_load_aligned(src + 2 * sstride);
const uint32_t l4 = u32_load_aligned(src + 3 * sstride);
const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
v128 o = v128_from_32(l1, l2, l3, l4);
const v128 a = v128_from_32(l0, l1, l2, l3);
v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0),
u32_load_unaligned(src + sstride - 2 * !!x0),
u32_load_unaligned(src + 2 * sstride - 2 * !!x0),
u32_load_unaligned(src + 3 * sstride - 2 * !!x0));
v128 c = v128_from_32(u32_load_unaligned(src - !!x0),
u32_load_unaligned(src + sstride - !!x0),
u32_load_unaligned(src + 2 * sstride - !!x0),
u32_load_unaligned(src + 3 * sstride - !!x0));
v128 d = v128_from_32(u32_load_unaligned(src + !!right),
u32_load_unaligned(src + sstride + !!right),
u32_load_unaligned(src + 2 * sstride + !!right),
u32_load_unaligned(src + 3 * sstride + !!right));
v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
u32_load_unaligned(src + sstride + 2 * !!right),
u32_load_unaligned(src + 2 * sstride + 2 * !!right),
u32_load_unaligned(src + 3 * sstride + 2 * !!right));
const v128 f = v128_from_32(l2, l3, l4, l5);
if (!x0) { // Left clipping
b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
}
} else { // No left/right clipping
int y;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(
l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride)));
calc_delta(o, x, a, b, c, d, e, f, dst, sp, sm, dstride);
src += sstride * 2;
dst += dstride * 2;
if (!right) { // Right clipping
d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
}
o = calc_delta(o, a, b, c, d, e, f, sp, sm);
u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
dst += 4 * dstride;
src += 4 * sstride;
}
}
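
clpf_block4 packs four 4-byte rows into one 128-bit vector with v128_from_32(l1, l2, l3, l4), the first row in the most significant bytes, so each output row is recovered by shifting down (3 - row) * 4 bytes and storing the low 32 bits. A plain-C model of that store order, assuming a little-endian byte view of the vector:

#include <stdint.h>
#include <string.h>

/* Sketch: store four packed 4-byte rows, most significant bytes first. */
static void store_4x4_rows(uint8_t *dst, int dstride, const uint8_t v[16]) {
  int r;
  for (r = 0; r < 4; r++) /* v[12..15] holds the first row */
    memcpy(dst + r * dstride, v + (3 - r) * 4, 4);
}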
@@ -142,24 +176,23 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex,
int sizey, int width, int height,
unsigned int strength) {
// TODO(stemidts):
// A sizex different from 8 will only be needed if CLPF is extended to chroma.
// This will only be used if 4:2:0 and width not a multiple of 16 and along
// the right edge only, so we can fall back to the plain C implementation in
// this case. If not extended to chroma, this test will be redundant.
if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
(sizey & 3 && sizex == 4) || x0 + 4 > width) {
// Fallback to C for odd sizes
aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
height, strength);
} else {
clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
strength);
(sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0, y0,
sizey, width, height, strength);
}
}
#if CONFIG_AOM_HIGHBITDEPTH
static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, uint16_t *dst, v128 sp, v128 sm) {
// delta = 4/16 * clamp(a - o, -s, s) + 1/16 * clamp(b - o, -s, s) +
// 3/16 * clamp(c - o, -s, s) + 3/16 * clamp(d - o, -s, s) +
// 1/16 * clamp(e - o, -s, s) + 4/16 * clamp(f - o, -s, s)
SIMD_INLINE v128 calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
const v128 c8 = v128_dup_16(8);
const v128 tmp =
v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(c, o), sp), sm),
@@ -174,73 +207,124 @@ static void calc_delta_hbd(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128_add_16(v128_max_s16(v128_min_s16(v128_sub_16(b, o), sp), sm),
v128_max_s16(v128_min_s16(v128_sub_16(e, o), sp), sm))),
v128_add_16(v128_add_16(tmp, tmp), tmp));
v128_store_aligned(
dst,
v128_add_16(
o, v128_shr_s16(
v128_add_16(c8, v128_add_16(delta, v128_cmplt_s16(
delta, v128_zero()))),
4)));
return v128_add_16(
o, v128_shr_s16(
v128_add_16(
c8, v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
4));
}
static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, uint16_t *dst, v128 sp, v128 sm,
int dstride) {
o = calc_delta_hbd(o, a, b, c, d, e, f, sp, sm);
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
}
static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, uint16_t *dst, v128 sp, v128 sm) {
v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, sp, sm));
}
// Process blocks of width 4, two lines at a time.
SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0,
int sizey, int width, int height,
unsigned int strength) {
const v128 sp = v128_dup_16(strength);
const v128 sm = v128_dup_16(-(int)strength);
const int right = width - 4 - x0;
const int bottom = height - 2 - y0;
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0302010001000100LL, 0x0b0a090809080908LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080908LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0706070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0706070607060504LL, 0x0f0e0f0e0f0e0d0cLL };
int y;