Commit 4305e6be authored by Steinar Midtskogen

CLPF: Add quality dependent damping in the constrain function

PSNR YCbCr:  -0.17%     -0.03%     -0.40%
APSNR YCbCr: -0.17%     -0.02%     -0.39%
PSNRHVS:     -0.06%
SSIM:        -0.17%
MSSSIM:      -0.07%
CIEDE2000:   -0.12%

Change-Id: I69a4b6a4e18c22c3930069396540a6fee45cb30d
parent ee4b3a80
......@@ -854,8 +854,8 @@ specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
if (aom_config("CONFIG_CDEF") eq "yes") {
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd";
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd";
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd";
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp";
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp";
# VS compiling for 32 bit targets does not support vector types in
# structs as arguments, which makes the v256 type of the intrinsics
# hard to support, so optimizations for this target are disabled.
......@@ -866,8 +866,8 @@ if (aom_config("CONFIG_CDEF") eq "yes") {
}
}
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd";
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd";
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd";
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp";
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int dmp";
# VS compiling for 32 bit targets does not support vector types in
# structs as arguments, which makes the v256 type of the intrinsics
# hard to support, so optimizations for this target are disabled.
......
......@@ -16,25 +16,25 @@
// Sign of i expressed as -1 or +1; zero is treated as positive.
int sign(int i) {
  if (i < 0) return -1;
  return 1;
}
// Limits the neighbour difference x according to the filter strength s and
// the damping (the log2 cut-off point of the filter):
//   sign(x) * max(0, |x| - max(0, |x| - s + (|x| >> (damping - log2(s)))))
// The shift amount is damping - get_msb(s).
// NOTE(review): this presumably requires s >= 1 and damping >= get_msb(s) so
// that the shift count stays in range -- confirm against the callers.
int constrain(int x, int s, unsigned int damping) {
  const int mag = abs(x);
  const int excess = AOMMAX(0, mag - s + (mag >> (damping - get_msb(s))));
  return sign(x) * AOMMAX(0, mag - excess);
}
// Computes the CLPF correction for sample X from its eight neighbours A..H.
// Every neighbour difference is passed through constrain() with strength s
// and damping dmp. B, D, E and G carry weight 3, while A, C, F and H carry
// weight 1. The weighted sum is then divided by 16 with rounding.
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
                    int H, int s, unsigned int dmp) {
  const int weight1 = constrain(A - X, s, dmp) + constrain(C - X, s, dmp) +
                      constrain(F - X, s, dmp) + constrain(H - X, s, dmp);
  const int weight3 = constrain(B - X, s, dmp) + constrain(D - X, s, dmp) +
                      constrain(E - X, s, dmp) + constrain(G - X, s, dmp);
  const int delta = weight1 + 3 * weight3;
  // Subtracting (delta < 0) before the shift biases negative sums by one.
  return (8 + delta - (delta < 0)) >> 4;
}
void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt,
unsigned int bitdepth) {
unsigned int damping) {
int x, y;
const int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2;
......@@ -53,7 +53,7 @@ void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
const int G = src[AOMMIN(ymax, y + 1) * sstride + x];
const int H = src[AOMMIN(ymax, y + 2) * sstride + x];
const int delta =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bitdepth);
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping);
dst[y * dstride + x] = X + delta;
}
}
......@@ -64,7 +64,7 @@ void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt,
unsigned int bitdepth) {
unsigned int damping) {
int x, y;
const int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2;
......@@ -83,7 +83,7 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
const int G = src[AOMMIN(ymax, y + 1) * sstride + x];
const int H = src[AOMMIN(ymax, y + 2) * sstride + x];
const int delta =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bitdepth);
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping);
dst[y * dstride + x] = X + delta;
}
}
......@@ -91,14 +91,13 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
#endif
// Apply the constrained low-pass filter (CLPF) to one plane of the frame
void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, int plane,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *,
const AV1_COMMON *cm, int, int, int,
unsigned int, unsigned int, int8_t *)) {
void av1_clpf_frame(
const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *org,
AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, int plane,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *, const AV1_COMMON *cm, int, int,
int, unsigned int, unsigned int, int8_t *, int)) {
/* Constrained low-pass filter (CLPF) */
int c, k, l, m, n;
const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
......@@ -124,6 +123,11 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
? (plane == AOM_PLANE_U ? frame->u_buffer : frame->v_buffer)
: frame->y_buffer;
uint8_t *dst_buffer;
// Damping is the filter cut-off log2 point for the constrain function.
// For instance, if the damping is 5, neighbour differences above 32 will
// be ignored and half of the strength will be applied for a difference of 16.
int damping =
cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6);
// Make buffer space for in-place filtering
#if CONFIG_AOM_HIGHBITDEPTH
......@@ -169,7 +173,8 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
fb_size_log2,
cm->clpf_blocks + yoff / MIN_FB_SIZE * cm->clpf_stride +
xoff / MIN_FB_SIZE))) {
xoff / MIN_FB_SIZE,
plane))) {
// Iterate over all smaller blocks inside the filter block
for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
......@@ -260,16 +265,16 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
CONVERT_TO_SHORTPTR(dst_buffer), sstride,
dstride, xpos, ypos, sizex, sizey, strength,
boundary_type, cm->bit_depth);
boundary_type, damping);
} else {
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
ypos, sizex, sizey, strength, boundary_type,
cm->bit_depth);
damping);
}
#else
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
ypos, sizex, sizey, strength, boundary_type,
cm->bit_depth);
damping);
#endif
}
}
......
......@@ -19,7 +19,7 @@
#define MIN_FB_SIZE (1 << MIN_FB_SIZE_LOG2)
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
int H, int b, unsigned int bd);
int H, int b, unsigned int dmp);
void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
......@@ -27,6 +27,6 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *,
const AV1_COMMON *cm, int, int, int,
unsigned int, unsigned int, int8_t *));
unsigned int, unsigned int, int8_t *, int));
#endif
......@@ -17,7 +17,8 @@
// Process blocks of width 8, two lines at a time, 8 bit.
static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey,
BOUNDARY_TYPE bt, unsigned int strength) {
BOUNDARY_TYPE bt, unsigned int strength,
unsigned int dmp) {
const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1;
const int right = !(bt & TILE_RIGHT_BOUNDARY);
const int left = !(bt & TILE_LEFT_BOUNDARY);
......@@ -68,7 +69,7 @@ static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
}
o = calc_delta(o, a, b, c, d, e, f, g, h, strength);
o = calc_delta(o, a, b, c, d, e, f, g, h, strength, dmp);
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
src += sstride * 2;
......@@ -79,7 +80,7 @@ static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
// As above, but with no clipping tests
static void clpf_block8_noclip(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey,
unsigned int strength) {
unsigned int strength, unsigned int dmp) {
int y;
dst += x0 + y0 * dstride;
......@@ -102,8 +103,8 @@ static void clpf_block8_noclip(const uint8_t *src, uint8_t *dst, int sstride,
v64_load_unaligned(src + 1 + sstride));
const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride));
const v128 o =
calc_delta(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, strength);
const v128 o = calc_delta(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h,
strength, dmp);
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
......@@ -115,7 +116,8 @@ static void clpf_block8_noclip(const uint8_t *src, uint8_t *dst, int sstride,
// Process blocks of width 4, four lines at a time, 8 bit.
static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey,
BOUNDARY_TYPE bt, unsigned int strength) {
BOUNDARY_TYPE bt, unsigned int strength,
unsigned int dmp) {
const int right = !(bt & TILE_RIGHT_BOUNDARY);
const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 4 : -1;
const int left = !(bt & TILE_LEFT_BOUNDARY);
......@@ -178,7 +180,7 @@ static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
}
o = calc_delta(o, a, b, c, d, e, f, g, h, strength);
o = calc_delta(o, a, b, c, d, e, f, g, h, strength, dmp);
u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
......@@ -192,7 +194,7 @@ static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
// As above, but with no clipping tests
static void clpf_block4_noclip(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey,
unsigned int strength) {
unsigned int strength, unsigned int dmp) {
int y;
dst += x0 + y0 * dstride;
......@@ -229,7 +231,7 @@ static void clpf_block4_noclip(const uint8_t *src, uint8_t *dst, int sstride,
u32_load_unaligned(src + 3 * sstride + 2));
const v128 o = calc_delta(v128_from_32(l2, l3, l4, l5), a, b, c, d, e, f, g,
h, strength);
h, strength, dmp);
u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
......@@ -244,34 +246,34 @@ static void clpf_block4_noclip(const uint8_t *src, uint8_t *dst, int sstride,
// Filters one 8 bit block with the given strength and damping (dmp).
// Blocks that the SIMD kernels do not handle are forwarded to the C
// implementation. Otherwise the width-4 or width-8 kernel is chosen: the
// boundary-aware variant when bt is non-zero, the "noclip" variant when it
// is zero.
void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
                               int dstride, int x0, int y0, int sizex,
                               int sizey, unsigned int strength,
                               BOUNDARY_TYPE bt, unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block widths not 4 or 8
    // * block heights not a multiple of 4 if the block width is 4
    aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, strength,
                     bt, dmp);
  } else {
    if (bt) {
      (sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0,
                                               y0, sizey, bt, strength, dmp);
    } else {
      (sizex == 4 ? clpf_block4_noclip : clpf_block8_noclip)(
          src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
    }
  }
}
#if CONFIG_AOM_HIGHBITDEPTH
// sign(a - b) * max(0, abs(a - b) - max(0, abs(a - b) -
// strength + (abs(a - b) >> (bd - 3 - log2(s)))))
// strength + (abs(a - b) >> (dmp - log2(s)))))
SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength,
unsigned int bd) {
unsigned int dmp) {
const v128 diff = v128_sub_16(v128_max_s16(a, b), v128_min_s16(a, b));
const v128 sign = v128_cmpeq_16(v128_min_s16(a, b), a); // -(a <= b)
const v128 zero = v128_zero();
const v128 s = v128_max_s16(
zero, v128_sub_16(v128_dup_16(strength),
v128_shr_u16(diff, bd - 3 - get_msb(strength))));
v128_shr_u16(diff, dmp - get_msb(strength))));
return v128_sub_16(
v128_xor(sign,
v128_max_s16(
......@@ -280,20 +282,21 @@ SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength,
sign);
}
// delta = 1/16 * constrain(a, x, s, bd) + 3/16 * constrain(b, x, s, bd) +
// 1/16 * constrain(c, x, s, bd) + 3/16 * constrain(d, x, s, bd) +
// 3/16 * constrain(e, x, s, bd) + 1/16 * constrain(f, x, s, bd) +
// 3/16 * constrain(g, x, s, bd) + 1/16 * constrain(h, x, s, bd)
// delta = 1/16 * constrain(a, x, s, dmp) + 3/16 * constrain(b, x, s, dmp) +
// 1/16 * constrain(c, x, s, dmp) + 3/16 * constrain(d, x, s, dmp) +
// 3/16 * constrain(e, x, s, dmp) + 1/16 * constrain(f, x, s, dmp) +
// 3/16 * constrain(g, x, s, dmp) + 1/16 * constrain(h, x, s, dmp)
SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 g, v128 h, unsigned int s,
unsigned int bd) {
unsigned int dmp) {
const v128 bdeg = v128_add_16(
v128_add_16(constrain_hbd(b, x, s, bd), constrain_hbd(d, x, s, bd)),
v128_add_16(constrain_hbd(e, x, s, bd), constrain_hbd(g, x, s, bd)));
v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(d, x, s, dmp)),
v128_add_16(constrain_hbd(e, x, s, dmp), constrain_hbd(g, x, s, dmp)));
const v128 delta = v128_add_16(
v128_add_16(
v128_add_16(constrain_hbd(a, x, s, bd), constrain_hbd(c, x, s, bd)),
v128_add_16(constrain_hbd(f, x, s, bd), constrain_hbd(h, x, s, bd))),
v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(c, x, s, dmp)),
v128_add_16(constrain_hbd(f, x, s, dmp),
constrain_hbd(h, x, s, dmp))),
v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
return v128_add_16(
x,
......@@ -305,23 +308,23 @@ SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
// Filters the samples in o using neighbours a..h, strength s and damping dmp,
// then stores the result as two rows: the high 64 bits go to dst and the low
// 64 bits go to dst + dstride.
static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, uint16_t *dst,
                            unsigned int s, unsigned int dmp, int dstride) {
  o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);
  v64_store_aligned(dst, v128_high_v64(o));
  v64_store_aligned(dst + dstride, v128_low_v64(o));
}
// Filters the samples in o using neighbours a..h, strength s and damping dmp,
// and stores the full 128 bit result at dst.
static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, uint16_t *dst,
                            unsigned int s, unsigned int dmp) {
  const v128 filtered = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);
  v128_store_aligned(dst, filtered);
}
// Process blocks of width 4, two lines at a time.
SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0,
int sizey, unsigned int strength,
BOUNDARY_TYPE bt, unsigned int bd) {
BOUNDARY_TYPE bt, unsigned int dmp) {
const int right = !(bt & TILE_RIGHT_BOUNDARY);
const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1;
const int left = !(bt & TILE_LEFT_BOUNDARY);
......@@ -372,7 +375,7 @@ SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
}
calc_delta_hbd4(o, a, b, c, d, e, f, g, h, dst, strength, bd, dstride);
calc_delta_hbd4(o, a, b, c, d, e, f, g, h, dst, strength, dmp, dstride);
src += sstride * 2;
dst += dstride * 2;
}
......@@ -383,7 +386,7 @@ SIMD_INLINE void clpf_block_hbd4_noclip(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0,
int y0, int sizey,
unsigned int strength,
unsigned int bd) {
unsigned int dmp) {
int y;
dst += x0 + y0 * dstride;
......@@ -408,7 +411,7 @@ SIMD_INLINE void clpf_block_hbd4_noclip(const uint16_t *src, uint16_t *dst,
v64_load_unaligned(src + 2 + sstride));
calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst,
strength, bd, dstride);
strength, dmp, dstride);
src += sstride * 2;
dst += dstride * 2;
}
......@@ -418,7 +421,7 @@ SIMD_INLINE void clpf_block_hbd4_noclip(const uint16_t *src, uint16_t *dst,
SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
int dstride, int x0, int y0, int sizey,
unsigned int strength, BOUNDARY_TYPE bt,
unsigned int bd) {
unsigned int dmp) {
const int right = !(bt & TILE_RIGHT_BOUNDARY);
const int left = !(bt & TILE_LEFT_BOUNDARY);
const int ymin = -!(bt & TILE_ABOVE_BOUNDARY) * 2;
......@@ -463,7 +466,7 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
}
calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, bd);
calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, dmp);
dst += dstride;
}
}
......@@ -472,7 +475,7 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
SIMD_INLINE void clpf_block_hbd_noclip(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0,
int sizey, unsigned int strength,
unsigned int bd) {
unsigned int dmp) {
int y;
dst += x0 + y0 * dstride;
......@@ -489,7 +492,7 @@ SIMD_INLINE void clpf_block_hbd_noclip(const uint16_t *src, uint16_t *dst,
const v128 e = v128_load_unaligned(src + 1);
const v128 f = v128_load_unaligned(src + 2);
calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, bd);
calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, dmp);
src += sstride;
dst += dstride;
}
......@@ -498,20 +501,20 @@ SIMD_INLINE void clpf_block_hbd_noclip(const uint16_t *src, uint16_t *dst,
// Filters one high bitdepth block with the given strength and damping (dmp).
// Blocks that the SIMD kernels do not handle are forwarded to the C
// implementation. Otherwise the width-4 or width-8 kernel is chosen: the
// boundary-aware variant when bt is non-zero, the "noclip" variant when it
// is zero.
void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
                                   int sstride, int dstride, int x0, int y0,
                                   int sizex, int sizey, unsigned int strength,
                                   BOUNDARY_TYPE bt, unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block width not 4 or 8
    // * block heights not a multiple of 2 if the block width is 4
    aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
                         strength, bt, dmp);
  } else {
    if (bt) {
      (sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)(
          src, dst, sstride, dstride, x0, y0, sizey, strength, bt, dmp);
    } else {
      (sizex == 4 ? clpf_block_hbd4_noclip : clpf_block_hbd_noclip)(
          src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
    }
  }
}
#endif
......@@ -16,11 +16,12 @@
// sign(a - b) * max(0, abs(a - b) - max(0, abs(a - b) -
// strength + (abs(a - b) >> (damping - log2(s)))))
// Works per 8 bit lane. The subtractions that must not go below zero use the
// saturating unsigned subtract (v128_ssub_u8).
SIMD_INLINE v128 constrain(v128 a, v128 b, unsigned int strength,
                           unsigned int damping) {
  const v128 diff = v128_sub_8(v128_max_u8(a, b), v128_min_u8(a, b));
  const v128 sign = v128_cmpeq_8(v128_min_u8(a, b), a);  // -(a <= b)
  const v128 s = v128_ssub_u8(v128_dup_8(strength),
                              v128_shr_u8(diff, damping - get_msb(strength)));
  // (v ^ sign) - sign negates v in the lanes where sign is all ones.
  return v128_sub_8(v128_xor(sign, v128_ssub_u8(diff, v128_ssub_u8(diff, s))),
                    sign);
}
......@@ -30,14 +31,15 @@ SIMD_INLINE v128 constrain(v128 a, v128 b, unsigned int strength) {
// 3/16 * constrain(e, x, s) + 1/16 * constrain(f, x, s) +
// 3/16 * constrain(g, x, s) + 1/16 * constrain(h, x, s)
SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 g, v128 h, unsigned int s) {
v128 f, v128 g, v128 h, unsigned int s,
unsigned int dmp) {
const v128 bdeg =
v128_add_8(v128_add_8(constrain(b, x, s), constrain(d, x, s)),
v128_add_8(constrain(e, x, s), constrain(g, x, s)));
const v128 delta =
v128_add_8(v128_add_8(v128_add_8(constrain(a, x, s), constrain(c, x, s)),
v128_add_8(constrain(f, x, s), constrain(h, x, s))),
v128_add_8(v128_add_8(bdeg, bdeg), bdeg));
v128_add_8(v128_add_8(constrain(b, x, s, dmp), constrain(d, x, s, dmp)),
v128_add_8(constrain(e, x, s, dmp), constrain(g, x, s, dmp)));
const v128 delta = v128_add_8(
v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(c, x, s, dmp)),
v128_add_8(constrain(f, x, s, dmp), constrain(h, x, s, dmp))),
v128_add_8(v128_add_8(bdeg, bdeg), bdeg));
return v128_add_8(
x, v128_shr_s8(
v128_add_8(v128_dup_8(8),
......
......@@ -2746,7 +2746,8 @@ static int clpf_bit(UNUSED int k, UNUSED int l,
UNUSED const YV12_BUFFER_CONFIG *org,
UNUSED const AV1_COMMON *cm, UNUSED int block_size,
UNUSED int w, UNUSED int h, UNUSED unsigned int strength,
UNUSED unsigned int fb_size_log2, int8_t *bit) {
UNUSED unsigned int fb_size_log2, int8_t *bit,
UNUSED int plane) {
return *bit;
}
......
......@@ -19,7 +19,7 @@
void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
int ostride, int x0, int y0, int width, int height,
int *sum0, int *sum1, unsigned int strength, int size,
unsigned int bd) {
unsigned int dmp) {
int x, y;
for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + size; x++) {
......@@ -34,7 +34,7 @@ void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x];
const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x];
const int delta =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bd);
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, dmp);
const int Y = X + delta;
*sum0 += (O - X) * (O - X);
*sum1 += (O - Y) * (O - Y);
......@@ -45,7 +45,7 @@ void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum, int size,
unsigned int bd) {
unsigned int dmp) {
int x, y;
for (y = y0; y < y0 + size; y++) {
......@@ -60,9 +60,9 @@ void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)];
const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x];
const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x];
const int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, bd);
const int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, bd);
const int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, bd);
const int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, dmp);
const int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, dmp);
const int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, dmp);
const int F1 = X + delta1;
const int F2 = X + delta2;
const int F3 = X + delta3;
......@@ -79,7 +79,8 @@ void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
int rstride, int ostride, int x0, int y0, int width,
int height, int *sum0, int *sum1,
unsigned int strength, int size, unsigned int bd) {
unsigned int strength, int size, unsigned int bd,
unsigned int dmp) {
const int shift = bd - 8;
int x, y;
for (y = y0; y < y0 + size; y++) {
......@@ -95,7 +96,7 @@ void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x] >> shift;
const int delta = av1_clpf_sample(X, A, B, C, D, E, F, G, H,
strength >> shift, bd - shift);
strength >> shift, dmp - shift);
const int Y = X + delta;
*sum0 += (O - X) * (O - X);
*sum1 += (O - Y) * (O - Y);
......@@ -107,7 +108,7 @@ void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum, int size,
unsigned int bd) {
unsigned int bd, unsigned int dmp) {
const int shift = bd - 8;
int x, y;
......@@ -124,11 +125,11 @@ void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x] >> shift;
const int delta1 =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, bd - shift);
av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, dmp - shift);
const int delta2 =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, bd - shift);
av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, dmp - shift);
const int delta3 =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, bd - shift);
av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, dmp - shift);
const int F1 = X + delta1;
const int F2 = X + delta2;
const int F3 = X + delta3;
......@@ -144,8 +145,10 @@ void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int block_size, int w, int h, unsigned int strength,
unsigned int fb_size_log2, int8_t *res) {
unsigned int fb_size_log2, int8_t *res, int plane) {
int m, n, sum0 = 0, sum1 = 0;
int damping =
cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6);
for (m = 0; m < h; m++) {
for (n = 0; n < w; n++) {
......@@ -160,18 +163,18 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength,
block_size, cm->bit_depth);
block_size, cm->bit_depth, damping);
} else {
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength,
block_size, cm->bit_depth);
block_size, damping);
}
#else
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength, block_size,
cm->bit_depth);
damping);
#endif
}
}
......@@ -214,6 +217,9 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
int damping =
cm->bit_depth - 5 - (plane != AOM_PLANE_Y) + (cm->base_qindex >> 6);
sum[0] = sum[1] = sum[2] = sum[3] = sum[4] = sum[5] = sum[6] = sum[7] = 0;
if (plane == AOM_PLANE_Y &&
fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
......@@ -270,19 +276,19 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
->mbmi.skip;
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec_buffer),
CONVERT_TO_SHORTPTR(org_buffer), rec_stride,
org_stride, xpos, ypos, rec_width, rec_height,
sum + skip, block_size, cm->bit_depth);
aom_clpf_detect_multi_hbd(
CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
rec_stride, org_stride, xpos, ypos, rec_width, rec_height,
sum + skip, block_size, cm->bit_depth, damping);
} else {
aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
xpos, ypos, rec_width, rec_height, sum + skip,
block_size, cm->bit_depth);
block_size, damping);
}
#else
aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
xpos, ypos, rec_width, rec_height, sum + skip,
block_size, cm->bit_depth);
block_size, damping);
#endif
filtered |= !skip;
}
......
......@@ -17,7 +17,7 @@
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int block_size, int w, int h, unsigned int strength,
unsigned int fb_size_log2, int8_t *res);
unsigned int fb_size_log2, int8_t *res, int plane);
void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
......
......@@ -69,7 +69,7 @@ void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,