Commit 4f0b3ed8 authored by Steinar Midtskogen

Retune the CLPF kernel

CLPF performance had degraded by about 0.5% over the past six months,
which isn't totally surprising since the codec is a moving target.
About half of that degradation comes from the improved 7 bit filter
coefficients.  Therefore, CLPF needs to be retuned for the current
codec.

This patch makes two (normative) changes to the CLPF kernel:

* The clipping function was changed from clamp(x, -s, s) to
      sign(x) * max(0, abs(x) - max(0, abs(x) - s +
             (abs(x) >> (bitdepth - 3 - log2(s)))))
  This adds a rampdown to 0 at -32 and 32 (for 8 bit, -128 & 128
  for 10 bit, etc), so large differences are ignored.

* 8 taps instead of 6 taps:
               1
    4          3
  13 31  ->  13 31
    4          3
               1

AWCY results: low delay  high delay
PSNR:           -0.40%     -0.47%
PSNR HVS:        0.00%     -0.11%
SSIM:           -0.31%     -0.39%
CIEDE 2000:     -0.22%     -0.31%
APSNR:          -0.40%     -0.48%
MS SSIM:         0.01%     -0.12%

About 3/4 of the gains come from the new clipping function.

Change-Id: Idad9dc4004e71a9c7ec81ba62ebd12fb76fb044a
parent 76ebf7ce
......@@ -853,9 +853,9 @@ specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
if (aom_config("CONFIG_CLPF") eq "yes") {
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt";
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift, int size";
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift, int size";
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd";
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd";
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd";
# VS compiling for 32 bit targets does not support vector types in
# structs as arguments, which makes the v256 type of the intrinsics
# hard to support, so optimizations for this target are disabled.
......@@ -865,9 +865,9 @@ if (aom_config("CONFIG_CLPF") eq "yes") {
specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
}
}
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt";
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size";
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size";
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd";
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd";
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd";
# VS compiling for 32 bit targets does not support vector types in
# structs as arguments, which makes the v256 type of the intrinsics
# hard to support, so optimizations for this target are disabled.
......
......@@ -85,6 +85,7 @@ ifeq ($(CONFIG_CLPF),yes)
AV1_COMMON_SRCS-yes += common/clpf.c
AV1_COMMON_SRCS-yes += common/clpf.h
AV1_COMMON_SRCS-yes += common/clpf_simd.h
AV1_COMMON_SRCS-yes += common/clpf_simd_kernel.h
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
......
......@@ -14,32 +14,46 @@
#include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h"
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) {
int delta = 4 * clamp(A - X, -b, b) + clamp(B - X, -b, b) +
3 * clamp(C - X, -b, b) + 3 * clamp(D - X, -b, b) +
clamp(E - X, -b, b) + 4 * clamp(F - X, -b, b);
// Returns -1 when i is negative and 1 otherwise (zero is treated as positive).
int sign(int i) {
  if (i < 0) return -1;
  return 1;
}
// Limits the difference x to a magnitude governed by the strength s:
//   sign(x) * max(0, abs(x) - max(0, abs(x) - s +
//                    (abs(x) >> (bitdepth - 3 - log2(s)))))
// Small differences pass through unchanged; beyond s the extra right-shifted
// term grows with abs(x), so the result ramps back down towards 0.
int constrain(int x, int s, unsigned int bitdepth) {
  const int magnitude = abs(x);
  // Amount by which the magnitude exceeds the (magnitude-dependent) limit.
  const int excess =
      AOMMAX(0, magnitude - s + (magnitude >> (bitdepth - 3 - get_msb(s))));
  return sign(x) * AOMMAX(0, magnitude - excess);
}
// Computes the offset to add to sample X from its eight neighbours.
// Each neighbour difference is passed through constrain() with strength s and
// bit depth bd; A, C, F and H are weighted 1/16, B, D, E and G are weighted
// 3/16.
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
                    int H, int s, unsigned int bd) {
  // Taps with weight 1.
  const int light = constrain(A - X, s, bd) + constrain(C - X, s, bd) +
                    constrain(F - X, s, bd) + constrain(H - X, s, bd);
  // Taps with weight 3.
  const int heavy = constrain(B - X, s, bd) + constrain(D - X, s, bd) +
                    constrain(E - X, s, bd) + constrain(G - X, s, bd);
  const int delta = light + 3 * heavy;
  // Divide by 16 with rounding; the (delta < 0) term keeps the rounding
  // symmetric around zero.
  return (8 + delta - (delta < 0)) >> 4;
}
void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt) {
unsigned int strength, BOUNDARY_TYPE bt,
unsigned int bitdepth) {
int x, y;
int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY);
int xmax = x0 + sizex + !(bt & TILE_RIGHT_BOUNDARY) * 2 - 1;
int ymax = y0 + sizey + !(bt & TILE_BOTTOM_BOUNDARY) - 1;
const int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2;
const int xmax = x0 + sizex + !(bt & TILE_RIGHT_BOUNDARY) * 2 - 1;
const int ymax = y0 + sizey + !(bt & TILE_BOTTOM_BOUNDARY) * 2 - 1;
for (y = y0; y < y0 + sizey; y++) {
for (x = x0; x < x0 + sizex; x++) {
int X = src[y * sstride + x];
int A = src[AOMMAX(ymin, y - 1) * sstride + x];
int B = src[y * sstride + AOMMAX(xmin, x - 2)];
int C = src[y * sstride + AOMMAX(xmin, x - 1)];
int D = src[y * sstride + AOMMIN(xmax, x + 1)];
int E = src[y * sstride + AOMMIN(xmax, x + 2)];
int F = src[AOMMIN(ymax, y + 1) * sstride + x];
int delta;
delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
const int X = src[y * sstride + x];
const int A = src[AOMMAX(ymin, y - 2) * sstride + x];
const int B = src[AOMMAX(ymin, y - 1) * sstride + x];
const int C = src[y * sstride + AOMMAX(xmin, x - 2)];
const int D = src[y * sstride + AOMMAX(xmin, x - 1)];
const int E = src[y * sstride + AOMMIN(xmax, x + 1)];
const int F = src[y * sstride + AOMMIN(xmax, x + 2)];
const int G = src[AOMMIN(ymax, y + 1) * sstride + x];
const int H = src[AOMMIN(ymax, y + 2) * sstride + x];
const int delta =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bitdepth);
dst[y * dstride + x] = X + delta;
}
}
......@@ -49,24 +63,27 @@ void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
// Identical to aom_clpf_block_c() apart from "src" and "dst".
void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt) {
unsigned int strength, BOUNDARY_TYPE bt,
unsigned int bitdepth) {
int x, y;
int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY);
int xmax = x0 + sizex + !(bt & TILE_RIGHT_BOUNDARY) * 2 - 1;
int ymax = y0 + sizey + !(bt & TILE_BOTTOM_BOUNDARY) - 1;
const int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2;
const int xmax = x0 + sizex + !(bt & TILE_RIGHT_BOUNDARY) * 2 - 1;
const int ymax = y0 + sizey + !(bt & TILE_BOTTOM_BOUNDARY) * 2 - 1;
for (y = y0; y < y0 + sizey; y++) {
for (x = x0; x < x0 + sizex; x++) {
int X = src[y * sstride + x];
int A = src[AOMMAX(ymin, y - 1) * sstride + x];
int B = src[y * sstride + AOMMAX(xmin, x - 2)];
int C = src[y * sstride + AOMMAX(xmin, x - 1)];
int D = src[y * sstride + AOMMIN(xmax, x + 1)];
int E = src[y * sstride + AOMMIN(xmax, x + 2)];
int F = src[AOMMIN(ymax, y + 1) * sstride + x];
int delta;
delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
const int X = src[y * sstride + x];
const int A = src[AOMMAX(ymin, y - 2) * sstride + x];
const int B = src[AOMMAX(ymin, y - 1) * sstride + x];
const int C = src[y * sstride + AOMMAX(xmin, x - 2)];
const int D = src[y * sstride + AOMMAX(xmin, x - 1)];
const int E = src[y * sstride + AOMMIN(xmax, x + 1)];
const int F = src[y * sstride + AOMMIN(xmax, x + 2)];
const int G = src[AOMMIN(ymax, y + 1) * sstride + x];
const int H = src[AOMMIN(ymax, y + 2) * sstride + x];
const int delta =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bitdepth);
dst[y * dstride + x] = X + delta;
}
}
......@@ -243,14 +260,16 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
CONVERT_TO_SHORTPTR(dst_buffer), sstride,
dstride, xpos, ypos, sizex, sizey, strength,
boundary_type);
boundary_type, cm->bit_depth);
} else {
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
ypos, sizex, sizey, strength, boundary_type);
ypos, sizex, sizey, strength, boundary_type,
cm->bit_depth);
}
#else
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
ypos, sizex, sizey, strength, boundary_type);
ypos, sizex, sizey, strength, boundary_type,
cm->bit_depth);
#endif
}
}
......
......@@ -18,7 +18,8 @@
#define MAX_FB_SIZE (1 << MAX_FB_SIZE_LOG2)
#define MIN_FB_SIZE (1 << MIN_FB_SIZE_LOG2)
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
// Returns the CLPF offset to add to sample X, computed from the eight
// neighbouring samples A..H. `b` is the filter strength (the definition names
// this parameter `s`) and `bd` is the bit depth.
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
int H, int b, unsigned int bd);
void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
......
This diff is collapsed.
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AV1_COMMON_CLPF_SIMD_KERNEL_H_
#define AV1_COMMON_CLPF_SIMD_KERNEL_H_
#include "aom_dsp/aom_simd.h"
// Per 8-bit lane:
//   sign(a - b) * max(0, abs(a - b) - max(0, abs(a - b) -
//       strength + (abs(a - b) >> (5 - log2(strength)))))
// Built from unsigned min/max and saturating subtractions, so no lane ever
// needs more than 8 bits.
SIMD_INLINE v128 constrain(v128 a, v128 b, unsigned int strength) {
  const v128 lo = v128_min_u8(a, b);
  const v128 hi = v128_max_u8(a, b);
  const v128 absdiff = v128_sub_8(hi, lo);
  const v128 negative = v128_cmpeq_8(lo, a);  // -(a <= b)
  // Limit shrinks as the difference grows; saturates at 0.
  const v128 limit = v128_ssub_u8(v128_dup_8(strength),
                                  v128_shr_u8(absdiff, 5 - get_msb(strength)));
  const v128 excess = v128_ssub_u8(absdiff, limit);
  const v128 magnitude = v128_ssub_u8(absdiff, excess);
  // (m ^ mask) - mask negates m in lanes where mask is all ones.
  return v128_sub_8(v128_xor(negative, magnitude), negative);
}
// Returns x + delta per 8-bit lane, where
//   delta = 1/16 * constrain(a, x, s) + 3/16 * constrain(b, x, s) +
//           1/16 * constrain(c, x, s) + 3/16 * constrain(d, x, s) +
//           3/16 * constrain(e, x, s) + 1/16 * constrain(f, x, s) +
//           3/16 * constrain(g, x, s) + 1/16 * constrain(h, x, s)
// rounded as (8 + sum - (sum < 0)) >> 4.
SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, unsigned int s) {
  // Sum of the four taps that carry weight 3.
  const v128 bd_sum = v128_add_8(constrain(b, x, s), constrain(d, x, s));
  const v128 eg_sum = v128_add_8(constrain(e, x, s), constrain(g, x, s));
  const v128 bdeg = v128_add_8(bd_sum, eg_sum);
  const v128 bdeg_x3 = v128_add_8(v128_add_8(bdeg, bdeg), bdeg);

  // Sum of the four taps that carry weight 1.
  const v128 ac_sum = v128_add_8(constrain(a, x, s), constrain(c, x, s));
  const v128 fh_sum = v128_add_8(constrain(f, x, s), constrain(h, x, s));
  const v128 acfh = v128_add_8(ac_sum, fh_sum);

  const v128 delta = v128_add_8(acfh, bdeg_x3);

  // The compare yields -1 in negative lanes, so adding it subtracts
  // (delta < 0) before the rounding shift.
  const v128 adjusted = v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()));
  const v128 rounded = v128_shr_s8(v128_add_8(v128_dup_8(8), adjusted), 4);
  return v128_add_8(x, rounded);
}
#endif
This diff is collapsed.
This diff is collapsed.
......@@ -28,7 +28,8 @@ namespace {
typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt);
unsigned int strength, BOUNDARY_TYPE bt,
unsigned int bitdepth);
typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
clpf_block_param_t;
......@@ -58,7 +59,7 @@ typedef ClpfBlockTest ClpfSpeedTest;
typedef void (*clpf_block_hbd_t)(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0,
int sizex, int sizey, unsigned int strength,
BOUNDARY_TYPE bt);
BOUNDARY_TYPE bt, unsigned int bitdepth);
typedef std::tr1::tuple<clpf_block_hbd_t, clpf_block_hbd_t, int, int>
clpf_block_hbd_param_t;
......@@ -90,11 +91,12 @@ template <typename pixel>
void test_clpf(int w, int h, int depth, int iterations,
void (*clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt),
unsigned int strength, BOUNDARY_TYPE bt,
unsigned int bitdepth),
void (*ref_clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex,
int sizey, unsigned int strength,
BOUNDARY_TYPE bt)) {
BOUNDARY_TYPE bt, unsigned int bitdepth)) {
const int size = 24;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, pixel, s[size * size]);
......@@ -129,10 +131,10 @@ void test_clpf(int w, int h, int depth, int iterations,
(TILE_RIGHT_BOUNDARY & -(xpos + w == size)) |
(TILE_BOTTOM_BOUNDARY & -(ypos + h == size)));
ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, 1 << strength,
bt);
bt, depth);
if (clpf != ref_clpf)
ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w,
h, 1 << strength, bt));
h, 1 << strength, bt, depth));
if (ref_clpf != clpf)
for (pos = 0; pos < size * size && !error; pos++) {
error = ref_d[pos] != d[pos];
......@@ -154,13 +156,17 @@ void test_clpf(int w, int h, int depth, int iterations,
<< "ypos: " << ypos << std::endl
<< "w: " << w << std::endl
<< "h: " << h << std::endl
<< "A=" << (pos > size ? (int16_t)s[pos - size] : -1) << std::endl
<< "B=" << (pos % size - 2 >= 0 ? (int16_t)s[pos - 2] : -1) << std::endl
<< "C=" << (pos % size - 1 >= 0 ? (int16_t)s[pos - 1] : -1) << std::endl
<< "A=" << (pos > 2 * size ? (int16_t)s[pos - 2 * size] : -1) << std::endl
<< "B=" << (pos > size ? (int16_t)s[pos - size] : -1) << std::endl
<< "C=" << (pos % size - 2 >= 0 ? (int16_t)s[pos - 2] : -1) << std::endl
<< "D=" << (pos % size - 1 >= 0 ? (int16_t)s[pos - 1] : -1) << std::endl
<< "X=" << (int16_t)s[pos] << std::endl
<< "D=" << (pos % size + 1 < size ? (int16_t)s[pos + 1] : -1) << std::endl
<< "E=" << (pos % size + 2 < size ? (int16_t)s[pos + 2] : -1) << std::endl
<< "F=" << (pos + size < size * size ? (int16_t)s[pos + size] : -1)
<< "E=" << (pos % size + 1 < size ? (int16_t)s[pos + 1] : -1) << std::endl
<< "F=" << (pos % size + 2 < size ? (int16_t)s[pos + 2] : -1) << std::endl
<< "G=" << (pos + size < size * size ? (int16_t)s[pos + size] : -1)
<< std::endl
<< "H="
<< (pos + 2 * size < size * size ? (int16_t)s[pos + 2 * size] : -1)
<< std::endl;
}
......@@ -169,11 +175,12 @@ void test_clpf_speed(int w, int h, int depth, int iterations,
void (*clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex,
int sizey, unsigned int strength,
BOUNDARY_TYPE bt),
BOUNDARY_TYPE bt, unsigned int bitdepth),
void (*ref_clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex,
int sizey, unsigned int strength,
BOUNDARY_TYPE bt)) {
BOUNDARY_TYPE bt,
unsigned int bitdepth)) {
aom_usec_timer ref_timer;
aom_usec_timer timer;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment