Commit 4f0b3ed8 authored by Steinar Midtskogen's avatar Steinar Midtskogen
Browse files

Retune the CLPF kernel

CLPF performance had degraded by about 0.5% over the past six months,
which isn't totally surprising since the codec is a moving target.
About half of that degradation comes from the improved 7 bit filter
coefficients.  Therefore, CLPF needs to be retuned for the current
codec.

This patch makes two (normative) changes to the CLPF kernel:

* The clipping function was changed from clamp(x, -s, s) to
      sign(x) * max(0, abs(x) - max(0, abs(x) - s +
             (abs(x) >> (bitdepth - 3 - log2(s)))))
  This adds a rampdown to 0 at -32 and 32 (for 8-bit; -128 and 128
  for 10-bit, etc.), so large differences are ignored.

* 8 taps instead of 6 taps:
               1
    4          3
  13 31  ->  13 31
    4          3
               1

AWCY results: low delay  high delay
PSNR:           -0.40%     -0.47%
PSNR HVS:        0.00%     -0.11%
SSIM:           -0.31%     -0.39%
CIEDE 2000:     -0.22%     -0.31%
APSNR:          -0.40%     -0.48%
MS SSIM:         0.01%     -0.12%

About 3/4 of the gains come from the new clipping function.

Change-Id: Idad9dc4004e71a9c7ec81ba62ebd12fb76fb044a
parent 76ebf7ce
......@@ -853,9 +853,9 @@ specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
if (aom_config("CONFIG_CLPF") eq "yes") {
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt";
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int shift, int size";
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int shift, int size";
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd";
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd";
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd";
# VS compiling for 32 bit targets does not support vector types in
# structs as arguments, which makes the v256 type of the intrinsics
# hard to support, so optimizations for this target are disabled.
......@@ -865,9 +865,9 @@ if (aom_config("CONFIG_CLPF") eq "yes") {
specialize qw/aom_clpf_detect_multi_hbd sse2 ssse3 sse4_1 neon/;
}
}
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt";
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size";
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size";
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd";
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd";
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd";
# VS compiling for 32 bit targets does not support vector types in
# structs as arguments, which makes the v256 type of the intrinsics
# hard to support, so optimizations for this target are disabled.
......
......@@ -85,6 +85,7 @@ ifeq ($(CONFIG_CLPF),yes)
AV1_COMMON_SRCS-yes += common/clpf.c
AV1_COMMON_SRCS-yes += common/clpf.h
AV1_COMMON_SRCS-yes += common/clpf_simd.h
AV1_COMMON_SRCS-yes += common/clpf_simd_kernel.h
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
......
......@@ -14,32 +14,46 @@
#include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h"
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) {
int delta = 4 * clamp(A - X, -b, b) + clamp(B - X, -b, b) +
3 * clamp(C - X, -b, b) + 3 * clamp(D - X, -b, b) +
clamp(E - X, -b, b) + 4 * clamp(F - X, -b, b);
// Returns -1 when i is negative and +1 otherwise (zero is treated as
// positive).
int sign(int i) {
  if (i < 0) {
    return -1;
  }
  return 1;
}
// Clipping function applied to a neighbour difference x for filter strength s:
//   sign(x) * max(0, |x| - max(0, |x| - s + (|x| >> (bitdepth - 3 - log2(s)))))
// Small differences pass through unchanged, larger ones are limited, and the
// shifted |x| term pulls the result back down towards zero as |x| grows.
int constrain(int x, int s, unsigned int bitdepth) {
  const int magnitude = abs(x);
  // How far the magnitude overshoots the strength, plus the rampdown term.
  const int excess =
      magnitude - s + (magnitude >> (bitdepth - 3 - get_msb(s)));
  const int limited = magnitude - AOMMAX(0, excess);
  return sign(x) * AOMMAX(0, limited);
}
// Computes the correction to add to sample X from its eight neighbours.
// Each neighbour difference is passed through constrain() with strength s and
// bit depth bd.  A, C, F and H carry weight 1/16 while B, D, E and G carry
// weight 3/16.  The weighted sum is rounded before the final shift by 4.
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
                    int H, int s, unsigned int bd) {
  const int weight1 = constrain(A - X, s, bd) + constrain(C - X, s, bd) +
                      constrain(F - X, s, bd) + constrain(H - X, s, bd);
  const int weight3 = constrain(B - X, s, bd) + constrain(D - X, s, bd) +
                      constrain(E - X, s, bd) + constrain(G - X, s, bd);
  const int delta = weight1 + 3 * weight3;
  // Bias of 8 for non-negative sums and 7 for negative ones, so that the
  // rounding is symmetric around zero.
  const int bias = (delta < 0) ? 7 : 8;
  return (delta + bias) >> 4;
}
void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt) {
unsigned int strength, BOUNDARY_TYPE bt,
unsigned int bitdepth) {
int x, y;
int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY);
int xmax = x0 + sizex + !(bt & TILE_RIGHT_BOUNDARY) * 2 - 1;
int ymax = y0 + sizey + !(bt & TILE_BOTTOM_BOUNDARY) - 1;
const int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2;
const int xmax = x0 + sizex + !(bt & TILE_RIGHT_BOUNDARY) * 2 - 1;
const int ymax = y0 + sizey + !(bt & TILE_BOTTOM_BOUNDARY) * 2 - 1;
for (y = y0; y < y0 + sizey; y++) {
for (x = x0; x < x0 + sizex; x++) {
int X = src[y * sstride + x];
int A = src[AOMMAX(ymin, y - 1) * sstride + x];
int B = src[y * sstride + AOMMAX(xmin, x - 2)];
int C = src[y * sstride + AOMMAX(xmin, x - 1)];
int D = src[y * sstride + AOMMIN(xmax, x + 1)];
int E = src[y * sstride + AOMMIN(xmax, x + 2)];
int F = src[AOMMIN(ymax, y + 1) * sstride + x];
int delta;
delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
const int X = src[y * sstride + x];
const int A = src[AOMMAX(ymin, y - 2) * sstride + x];
const int B = src[AOMMAX(ymin, y - 1) * sstride + x];
const int C = src[y * sstride + AOMMAX(xmin, x - 2)];
const int D = src[y * sstride + AOMMAX(xmin, x - 1)];
const int E = src[y * sstride + AOMMIN(xmax, x + 1)];
const int F = src[y * sstride + AOMMIN(xmax, x + 2)];
const int G = src[AOMMIN(ymax, y + 1) * sstride + x];
const int H = src[AOMMIN(ymax, y + 2) * sstride + x];
const int delta =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bitdepth);
dst[y * dstride + x] = X + delta;
}
}
......@@ -49,24 +63,27 @@ void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
// Identical to aom_clpf_block_c() apart from "src" and "dst".
void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt) {
unsigned int strength, BOUNDARY_TYPE bt,
unsigned int bitdepth) {
int x, y;
int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY);
int xmax = x0 + sizex + !(bt & TILE_RIGHT_BOUNDARY) * 2 - 1;
int ymax = y0 + sizey + !(bt & TILE_BOTTOM_BOUNDARY) - 1;
const int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2;
const int xmax = x0 + sizex + !(bt & TILE_RIGHT_BOUNDARY) * 2 - 1;
const int ymax = y0 + sizey + !(bt & TILE_BOTTOM_BOUNDARY) * 2 - 1;
for (y = y0; y < y0 + sizey; y++) {
for (x = x0; x < x0 + sizex; x++) {
int X = src[y * sstride + x];
int A = src[AOMMAX(ymin, y - 1) * sstride + x];
int B = src[y * sstride + AOMMAX(xmin, x - 2)];
int C = src[y * sstride + AOMMAX(xmin, x - 1)];
int D = src[y * sstride + AOMMIN(xmax, x + 1)];
int E = src[y * sstride + AOMMIN(xmax, x + 2)];
int F = src[AOMMIN(ymax, y + 1) * sstride + x];
int delta;
delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
const int X = src[y * sstride + x];
const int A = src[AOMMAX(ymin, y - 2) * sstride + x];
const int B = src[AOMMAX(ymin, y - 1) * sstride + x];
const int C = src[y * sstride + AOMMAX(xmin, x - 2)];
const int D = src[y * sstride + AOMMAX(xmin, x - 1)];
const int E = src[y * sstride + AOMMIN(xmax, x + 1)];
const int F = src[y * sstride + AOMMIN(xmax, x + 2)];
const int G = src[AOMMIN(ymax, y + 1) * sstride + x];
const int H = src[AOMMIN(ymax, y + 2) * sstride + x];
const int delta =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bitdepth);
dst[y * dstride + x] = X + delta;
}
}
......@@ -243,14 +260,16 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
CONVERT_TO_SHORTPTR(dst_buffer), sstride,
dstride, xpos, ypos, sizex, sizey, strength,
boundary_type);
boundary_type, cm->bit_depth);
} else {
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
ypos, sizex, sizey, strength, boundary_type);
ypos, sizex, sizey, strength, boundary_type,
cm->bit_depth);
}
#else
aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
ypos, sizex, sizey, strength, boundary_type);
ypos, sizex, sizey, strength, boundary_type,
cm->bit_depth);
#endif
}
}
......
......@@ -18,7 +18,8 @@
#define MAX_FB_SIZE (1 << MAX_FB_SIZE_LOG2)
#define MIN_FB_SIZE (1 << MIN_FB_SIZE_LOG2)
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
// Returns the correction that the CLPF kernel adds to sample X.
//   X      - the sample being filtered.
//   A..H   - the eight neighbours of X.  The block filter passes the samples
//            two rows and one row above (A, B), two and one columns to the
//            left (C, D), one and two columns to the right (E, F), and one
//            and two rows below (G, H).
//   b      - filter strength (named "s" in the definition).
//   bd     - bit depth, forwarded to the clipping function.
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
int H, int b, unsigned int bd);
void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
......
This diff is collapsed.
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AV1_COMMON_CLPF_SIMD_KERNEL_H_
#define AV1_COMMON_CLPF_SIMD_KERNEL_H_
#include "aom_dsp/aom_simd.h"
// Per 8-bit lane, with x = a - b and s = strength:
//   sign(x) * max(0, abs(x) - max(0, abs(x) - s + (abs(x) >> (5 - log2(s)))))
SIMD_INLINE v128 constrain(v128 a, v128 b, unsigned int strength) {
  const v128 lo = v128_min_u8(a, b);
  const v128 hi = v128_max_u8(a, b);
  const v128 magnitude = v128_sub_8(hi, lo);  // abs(a - b)
  const v128 negate = v128_cmpeq_8(lo, a);    // -(a <= b)
  // Per-lane limit: strength reduced by the shifted magnitude.  NOTE(review):
  // relies on v128_ssub_u8 being an unsigned saturating subtract so the limit
  // bottoms out at 0 -- confirm against the intrinsics header.
  const v128 limit = v128_ssub_u8(v128_dup_8(strength),
                                  v128_shr_u8(magnitude, 5 - get_msb(strength)));
  // magnitude - sat_sub(magnitude, limit) is min(magnitude, limit).
  const v128 clipped =
      v128_ssub_u8(magnitude, v128_ssub_u8(magnitude, limit));
  // (v ^ m) - m negates v in the lanes where the mask m is set.
  return v128_sub_8(v128_xor(negate, clipped), negate);
}
// Returns x + delta per 8-bit lane, where
//   delta = 1/16 * (constrain(a, x, s) + constrain(c, x, s) +
//                   constrain(f, x, s) + constrain(h, x, s)) +
//           3/16 * (constrain(b, x, s) + constrain(d, x, s) +
//                   constrain(e, x, s) + constrain(g, x, s))
// and the division by 16 is a rounded arithmetic shift.
SIMD_INLINE v128 calc_delta(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, unsigned int s) {
  // Taps weighted by 3.
  const v128 heavy =
      v128_add_8(v128_add_8(constrain(b, x, s), constrain(d, x, s)),
                 v128_add_8(constrain(e, x, s), constrain(g, x, s)));
  // Taps weighted by 1.
  const v128 light =
      v128_add_8(v128_add_8(constrain(a, x, s), constrain(c, x, s)),
                 v128_add_8(constrain(f, x, s), constrain(h, x, s)));
  const v128 heavy_x3 = v128_add_8(v128_add_8(heavy, heavy), heavy);
  const v128 delta = v128_add_8(light, heavy_x3);
  // Compare mask for negative lanes; adding it lowers the rounding bias there
  // (presumably by one, mirroring "- (delta < 0)" in the C kernel -- confirm).
  const v128 negative = v128_cmplt_s8(delta, v128_zero());
  const v128 biased = v128_add_8(v128_dup_8(8), v128_add_8(delta, negative));
  const v128 rounded = v128_shr_s8(biased, 4);
  return v128_add_8(x, rounded);
}
#endif
......@@ -18,20 +18,24 @@
// Calculate the error of a filtered and unfiltered block
void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
int ostride, int x0, int y0, int width, int height,
int *sum0, int *sum1, unsigned int strength, int size) {
int *sum0, int *sum1, unsigned int strength, int size,
unsigned int bd) {
int x, y;
for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x];
int X = rec[y * rstride + x];
int A = rec[AOMMAX(0, y - 1) * rstride + x];
int B = rec[y * rstride + AOMMAX(0, x - 2)];
int C = rec[y * rstride + AOMMAX(0, x - 1)];
int D = rec[y * rstride + AOMMIN(width - 1, x + 1)];
int E = rec[y * rstride + AOMMIN(width - 1, x + 2)];
int F = rec[AOMMIN(height - 1, y + 1) * rstride + x];
int delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
int Y = X + delta;
const int O = org[y * ostride + x];
const int X = rec[y * rstride + x];
const int A = rec[AOMMAX(0, y - 2) * rstride + x];
const int B = rec[AOMMAX(0, y - 1) * rstride + x];
const int C = rec[y * rstride + AOMMAX(0, x - 2)];
const int D = rec[y * rstride + AOMMAX(0, x - 1)];
const int E = rec[y * rstride + AOMMIN(width - 1, x + 1)];
const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)];
const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x];
const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x];
const int delta =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, bd);
const int Y = X + delta;
*sum0 += (O - X) * (O - X);
*sum1 += (O - Y) * (O - Y);
}
......@@ -40,25 +44,28 @@ void aom_clpf_detect_c(const uint8_t *rec, const uint8_t *org, int rstride,
void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum, int size) {
int width, int height, int *sum, int size,
unsigned int bd) {
int x, y;
for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x];
int X = rec[y * rstride + x];
int A = rec[AOMMAX(0, y - 1) * rstride + x];
int B = rec[y * rstride + AOMMAX(0, x - 2)];
int C = rec[y * rstride + AOMMAX(0, x - 1)];
int D = rec[y * rstride + AOMMIN(width - 1, x + 1)];
int E = rec[y * rstride + AOMMIN(width - 1, x + 2)];
int F = rec[AOMMIN(height - 1, y + 1) * rstride + x];
int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, 1);
int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, 2);
int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, 4);
int F1 = X + delta1;
int F2 = X + delta2;
int F3 = X + delta3;
const int O = org[y * ostride + x];
const int X = rec[y * rstride + x];
const int A = rec[AOMMAX(0, y - 2) * rstride + x];
const int B = rec[AOMMAX(0, y - 1) * rstride + x];
const int C = rec[y * rstride + AOMMAX(0, x - 2)];
const int D = rec[y * rstride + AOMMAX(0, x - 1)];
const int E = rec[y * rstride + AOMMIN(width - 1, x + 1)];
const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)];
const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x];
const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x];
const int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, bd);
const int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, bd);
const int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, bd);
const int F1 = X + delta1;
const int F2 = X + delta2;
const int F3 = X + delta3;
sum[0] += (O - X) * (O - X);
sum[1] += (O - F1) * (O - F1);
sum[2] += (O - F2) * (O - F2);
......@@ -72,20 +79,24 @@ void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
int rstride, int ostride, int x0, int y0, int width,
int height, int *sum0, int *sum1,
unsigned int strength, int shift, int size) {
unsigned int strength, int size, unsigned int bd) {
const int shift = bd - 8;
int x, y;
for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x] >> shift;
int X = rec[y * rstride + x] >> shift;
int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
int B = rec[y * rstride + AOMMAX(0, x - 2)] >> shift;
int C = rec[y * rstride + AOMMAX(0, x - 1)] >> shift;
int D = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift;
int E = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift;
int F = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
int delta = av1_clpf_sample(X, A, B, C, D, E, F, strength >> shift);
int Y = X + delta;
const int O = org[y * ostride + x] >> shift;
const int X = rec[y * rstride + x] >> shift;
const int A = rec[AOMMAX(0, y - 2) * rstride + x] >> shift;
const int B = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
const int C = rec[y * rstride + AOMMAX(0, x - 2)] >> shift;
const int D = rec[y * rstride + AOMMAX(0, x - 1)] >> shift;
const int E = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift;
const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift;
const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x] >> shift;
const int delta = av1_clpf_sample(X, A, B, C, D, E, F, G, H,
strength >> shift, bd - shift);
const int Y = X + delta;
*sum0 += (O - X) * (O - X);
*sum1 += (O - Y) * (O - Y);
}
......@@ -95,26 +106,32 @@ void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
// aom_clpf_detect_multi_c() apart from "rec" and "org".
void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum, int shift,
int size) {
int width, int height, int *sum, int size,
unsigned int bd) {
const int shift = bd - 8;
int x, y;
for (y = y0; y < y0 + size; y++) {
for (x = x0; x < x0 + size; x++) {
int O = org[y * ostride + x] >> shift;
int X = rec[y * rstride + x] >> shift;
int A = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
int B = rec[y * rstride + AOMMAX(0, x - 2)] >> shift;
int C = rec[y * rstride + AOMMAX(0, x - 1)] >> shift;
int D = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift;
int E = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift;
int F = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, 1);
int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, 2);
int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, 4);
int F1 = X + delta1;
int F2 = X + delta2;
int F3 = X + delta3;
const int A = rec[AOMMAX(0, y - 2) * rstride + x] >> shift;
const int B = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
const int C = rec[y * rstride + AOMMAX(0, x - 2)] >> shift;
const int D = rec[y * rstride + AOMMAX(0, x - 1)] >> shift;
const int E = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift;
const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift;
const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x] >> shift;
const int delta1 =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, bd - shift);
const int delta2 =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, bd - shift);
const int delta3 =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, bd - shift);
const int F1 = X + delta1;
const int F2 = X + delta2;
const int F3 = X + delta3;
sum[0] += (O - X) * (O - X);
sum[1] += (O - F1) * (O - F1);
sum[2] += (O - F2) * (O - F2);
......@@ -143,17 +160,18 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
CONVERT_TO_SHORTPTR(org->y_buffer), rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength,
cm->bit_depth - 8, block_size);
block_size, cm->bit_depth);
} else {
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength,
block_size);
block_size, cm->bit_depth);
}
#else
aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
org->y_stride, xpos, ypos, rec->y_crop_width,
rec->y_crop_height, &sum0, &sum1, strength, block_size);
rec->y_crop_height, &sum0, &sum1, strength, block_size,
cm->bit_depth);
#endif
}
}
......@@ -255,16 +273,16 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec_buffer),
CONVERT_TO_SHORTPTR(org_buffer), rec_stride,
org_stride, xpos, ypos, rec_width, rec_height,
sum + skip, cm->bit_depth - 8, block_size);
sum + skip, block_size, cm->bit_depth);
} else {
aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
xpos, ypos, rec_width, rec_height, sum + skip,
block_size);
block_size, cm->bit_depth);
}
#else
aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
xpos, ypos, rec_width, rec_height, sum + skip,
block_size);
block_size, cm->bit_depth);
#endif
filtered |= !skip;
}
......
......@@ -12,64 +12,27 @@
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_simd.h"
#include "aom_ports/mem.h"
#include "aom_ports/bitops.h"
#include "av1/common/clpf_simd_kernel.h"
// Replaces each of the six neighbour vectors *a..*f, in place, with its
// per-lane difference from the centre vector o.  Both operands are offset by
// 128 before the subtraction.
SIMD_INLINE void calc_diff(v128 o, v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
v128 *f) {
// The difference will be 9 bit, offset by 128 so we can use saturated
// sub to avoid going to 16 bit temporarily before "strength" clipping.
const v128 c128 = v128_dup_8(128);
// Offset centre value, shared by all six subtractions below.
v128 x = v128_add_8(c128, o);
*a = v128_ssub_s8(v128_add_8(c128, *a), x);
*b = v128_ssub_s8(v128_add_8(c128, *b), x);
*c = v128_ssub_s8(v128_add_8(c128, *c), x);
*d = v128_ssub_s8(v128_add_8(c128, *d), x);
*e = v128_ssub_s8(v128_add_8(c128, *e), x);
*f = v128_ssub_s8(v128_add_8(c128, *f), x);
}
// Six-tap kernel operating on neighbour differences a..f.  Each difference
// is clamped to the range [sm, sp] with a min/max pair, then weighted:
// a and f by 4 (left shift by 2), b and e by 1, c and d by 3.
// The weighted sum is rounded, shifted right by 4 and added to o.
SIMD_INLINE v128 delta_kernel(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
// Clamped c + d; added three times below to give these taps a weight of 3.
const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(c, sp), sm),
v128_max_s8(v128_min_s8(d, sp), sm));
const v128 delta = v128_add_8(
v128_add_8(v128_shl_8(v128_add_8(v128_max_s8(v128_min_s8(a, sp), sm),
v128_max_s8(v128_min_s8(f, sp), sm)),
2),
v128_add_8(v128_max_s8(v128_min_s8(b, sp), sm),
v128_max_s8(v128_min_s8(e, sp), sm))),
v128_add_8(v128_add_8(tmp, tmp), tmp));
// Add the rounding constant 8 plus the "delta < 0" compare result, then
// shift right by 4.
return v128_add_8(
o, v128_shr_s8(
v128_add_8(v128_dup_8(8),
v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
4));
}
// Filters the centre vector o using its six neighbours a..f: first converts
// the neighbours to differences from o with calc_diff(), then returns the
// result of delta_kernel() with the clamping bounds sp and sm.
SIMD_INLINE v128 calc_delta(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 sp, v128 sm) {
calc_diff(o, &a, &b, &c, &d, &e, &f);
return delta_kernel(o, a, b, c, d, e, f, sp, sm);
}
SIMD_INLINE void clip_sides(v128 *b, v128 *c, v128 *d, v128 *e, int left,
SIMD_INLINE void clip_sides(v128 *c, v128 *d, v128 *e, v128 *f, int left,
int right) {
DECLARE_ALIGNED(16, static const uint64_t,
b_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
c_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
d_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
e_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
f_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
if (!left) { // Left clipping
*b = v128_shuffle_8(*b, v128_load_aligned(b_shuff));
*c = v128_shuffle_8(*c, v128_load_aligned(c_shuff));
*d = v128_shuffle_8(*d, v128_load_aligned(d_shuff));
}
if (!right) { // Right clipping
*d = v128_shuffle_8(*d, v128_load_aligned(d_shuff));
*e = v128_shuffle_8(*e, v128_load_aligned(e_shuff));
*f = v128_shuffle_8(*f, v128_load_aligned(f_shuff));
}
}
......@@ -77,41 +40,45 @@ SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int bottom, int right, int y, v128 *o, v128 *r,
v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
v128 *f) {
v128 *f, v128 *g, v128 *h) {
const v64 k1 = v64_load_aligned(org);
const v64 k2 = v64_load_aligned(org + ostride);
const v64 l1 = v64_load_aligned(rec);
const v64 l2 = v64_load_aligned(rec + rstride);
const v64 l3 = v64_load_aligned(rec - (y != -y0) * rstride);
const v64 l4 = v64_load_aligned(rec + ((y != bottom) + 1) * rstride);
*o = v128_from_v64(k1, k2);
*r = v128_from_v64(l1, l2);
*a = v128_from_v64(v64_load_aligned(rec - (y != -y0) * rstride), l1);
*f = v128_from_v64(l2, v64_load_aligned(rec + ((y != bottom) + 1) * rstride));
*b = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0),
*a = v128_from_v64(v64_load_aligned(rec - 2 * (y != -y0) * rstride), l3);
*b = v128_from_v64(l3, l1);
*g = v128_from_v64(l2, l4);
*h = v128_from_v64(l4,
v64_load_aligned(rec + (2 * (y != bottom) + 1) * rstride));
*c = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0),
v64_load_unaligned(rec - 2 * !!x0 + rstride));
*c = v128_from_v64(v64_load_unaligned(rec - !!x0),
*d = v128_from_v64(v64_load_unaligned(rec - !!x0),
v64_load_unaligned(rec - !!x0 + rstride));
*d = v128_from_v64(v64_load_unaligned(rec + !!right),
*e = v128_from_v64(v64_load_unaligned(rec + !!right),
v64_load_unaligned(rec + !!right + rstride));
*e = v128_from_v64(v64_load_unaligned(rec + 2 * !!right),
*f = v128_from_v64(v64_load_unaligned(rec + 2 * !!right),
v64_load_unaligned(rec + 2 * !!right + rstride));
clip_sides(b, c, d, e, x0, right);
clip_sides(c, d, e, f, x0, right);
}
void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
int rstride, int ostride, int x0, int y0,
int width, int height, int *sum0, int *sum1,
unsigned int strength, int size) {
const v128 sp = v128_dup_8(strength);
const v128 sm = v128_dup_8(-(int)strength);
const int right = width - 8 - x0;
unsigned int strength, int size,
unsigned int bd) {
const int bottom = height - 2 - y0;
const int right = width - 8 - x0;
ssd128_internal ssd0 = v128_ssd_u8_init();
ssd128_internal ssd1 = v128_ssd_u8_init();
int y;
if (size != 8) { // Fallback to plain C
aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0,
sum1, strength, size);
sum1, strength, size, bd);
return;
}