Commit 8ff52fcc authored by Steinar Midtskogen

CDEF: Add damping to dering

high-latency, cpu-used=0:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.1650 |  0.2545 |  0.2977 |  -0.0423 | -0.0947 | -0.0725 |    -0.0365

low-latency, cpu-used=0:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.4006 |  0.0501 | -0.0108 |  -0.1790 | -0.1660 | -0.1992 |    -0.2135

low-latency, cpu-used=4:
   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.5508 | -0.2445 | -0.2762 |  -0.1981 | -0.2878 | -0.2228 |    -0.3733

Change-Id: Ia20df28c8bbb6182215b02016053af33bd498145
parent 1d18460f
...@@ -227,6 +227,7 @@ if (CONFIG_CDEF) ...@@ -227,6 +227,7 @@ if (CONFIG_CDEF)
"${AOM_ROOT}/av1/common/clpf.c" "${AOM_ROOT}/av1/common/clpf.c"
"${AOM_ROOT}/av1/common/clpf.h" "${AOM_ROOT}/av1/common/clpf.h"
"${AOM_ROOT}/av1/common/clpf_simd.h" "${AOM_ROOT}/av1/common/clpf_simd.h"
"${AOM_ROOT}/av1/common/cdef_simd.h"
"${AOM_ROOT}/av1/common/cdef.c" "${AOM_ROOT}/av1/common/cdef.c"
"${AOM_ROOT}/av1/common/cdef.h" "${AOM_ROOT}/av1/common/cdef.h"
"${AOM_ROOT}/av1/common/od_dering.c" "${AOM_ROOT}/av1/common/od_dering.c"
......
...@@ -89,6 +89,7 @@ ifeq ($(CONFIG_CDEF),yes) ...@@ -89,6 +89,7 @@ ifeq ($(CONFIG_CDEF),yes)
AV1_COMMON_SRCS-yes += common/clpf.c AV1_COMMON_SRCS-yes += common/clpf.c
AV1_COMMON_SRCS-yes += common/clpf.h AV1_COMMON_SRCS-yes += common/clpf.h
AV1_COMMON_SRCS-yes += common/clpf_simd.h AV1_COMMON_SRCS-yes += common/clpf_simd.h
AV1_COMMON_SRCS-yes += common/cdef_simd.h
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
......
...@@ -626,8 +626,8 @@ if (aom_config("CONFIG_CDEF") eq "yes") { ...@@ -626,8 +626,8 @@ if (aom_config("CONFIG_CDEF") eq "yes") {
add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd"; add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd"; add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift"; add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
add_proto qw/void od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir"; add_proto qw/void od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
add_proto qw/void od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir"; add_proto qw/void od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride"; add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride"; add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
......
...@@ -23,6 +23,18 @@ ...@@ -23,6 +23,18 @@
#include "av1/common/onyxc_int.h" #include "av1/common/onyxc_int.h"
#include "./od_dering.h" #include "./od_dering.h"
/* Sign of i as a unit multiplier: -1 when i is negative, +1 otherwise
 * (zero is treated as positive). */
static INLINE int sign(int i) {
  if (i < 0) return -1;
  return 1;
}
/* Limits a sample difference before it is weighted into a filter sum.
 *
 * Computes
 *   sign(diff) * min(|diff|, max(0, threshold - (|diff| >> shift)))
 * where shift = damping - get_msb(threshold), clamped so that it is never
 * negative.
 *
 * diff:      difference between a neighbouring sample and the centre sample.
 * threshold: filter strength; 0 disables the contribution entirely.
 * damping:   controls how quickly the allowed contribution falls off as
 *            |diff| grows; larger values give a slower fall-off.
 *
 * Returns the constrained difference, carrying the sign of diff, or 0 when
 * threshold is 0.
 */
static INLINE int constrain(int diff, int threshold, unsigned int damping) {
  int shift;
  int magnitude;

  /* As in the original ternary, a zero threshold short-circuits to 0 and is
   * never handed to get_msb(). */
  if (!threshold) return 0;

  /* The subtraction used to be carried out in unsigned arithmetic, so a
   * threshold whose top bit index exceeded damping wrapped to a huge shift
   * count (undefined behaviour). Do it in signed arithmetic and clamp at 0.
   * NOTE(review): SIMD callers that derive their own shift from
   * damping - get_msb(threshold) need the same clamp to stay bit-exact. */
  shift = (int)damping - get_msb(threshold);
  if (shift < 0) shift = 0;

  magnitude = abs(diff);
  return sign(diff) *
         AOMMIN(magnitude, AOMMAX(0, threshold - (magnitude >> shift)));
}
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
......
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AV1_COMMON_CDEF_SIMD_H_
#define AV1_COMMON_CDEF_SIMD_H_
#include "aom_dsp/aom_simd.h"
// Per 16-bit lane:
//   sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
// The caller supplies adjdamp, i.e. the shift that is applied directly to
// abs(a-b); no further adjustment is made here.
SIMD_INLINE v128 constrain16(v128 a, v128 b, unsigned int threshold,
                             unsigned int adjdamp) {
  const v128 delta = v128_sub_16(a, b);
  // Signed shift by 15: the mask is expected to be all ones in lanes where
  // a - b is negative and zero elsewhere.
  const v128 neg_mask = v128_shr_n_s16(delta, 15);
  const v128 magnitude = v128_abs_s16(delta);
  // The saturating subtract provides the max(0, ...) part of the formula.
  const v128 limit = v128_ssub_u16(v128_dup_16(threshold),
                                   v128_shr_u16(magnitude, adjdamp));
  const v128 clamped = v128_min_s16(magnitude, limit);
  // (x + mask) ^ mask negates x where mask is all ones and leaves it
  // untouched where mask is zero, restoring the original sign.
  return v128_xor(v128_add_16(neg_mask, clamped), neg_mask);
}
#endif // AV1_COMMON_CDEF_SIMD_H_
...@@ -9,18 +9,12 @@ ...@@ -9,18 +9,12 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent. * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/ */
#include "av1/common/clpf.h" #include "./clpf.h"
#include "./av1_rtcd.h" #include "./av1_rtcd.h"
#include "./cdef.h"
#include "aom/aom_image.h" #include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_dsp_common.h"
/* Maps a value to its sign multiplier: -1 for negatives, +1 for zero and
 * positives. */
static int sign(int i) {
  if (i < 0) return -1;
  return 1;
}
/* Limits a sample difference x according to strength s:
 *   sign(x) * min(|x|, max(0, s - (|x| >> shift)))
 * where shift = damping - get_msb(s), clamped so that it is never negative.
 *
 * x:       difference between a neighbouring sample and the centre sample.
 * s:       filter strength; 0 yields 0.
 * damping: controls how quickly the allowed contribution falls off as |x|
 *          grows.
 *
 * Returns the constrained difference, carrying the sign of x.
 */
static int constrain(int x, int s, unsigned int damping) {
  int shift;
  int magnitude;

  /* Previously s == 0 was passed straight to get_msb(); return early instead
   * so a zero strength simply contributes nothing. */
  if (!s) return 0;

  /* Signed arithmetic plus a clamp: the old unsigned subtraction wrapped to
   * an enormous shift count whenever get_msb(s) exceeded damping, which is
   * undefined behaviour. */
  shift = (int)damping - get_msb(s);
  if (shift < 0) shift = 0;

  magnitude = abs(x);
  return sign(x) * AOMMIN(magnitude, AOMMAX(0, s - (magnitude >> shift)));
}
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G, int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
int H, int s, unsigned int dmp) { int H, int s, unsigned int dmp) {
int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) + int delta = 1 * constrain(A - X, s, dmp) + 3 * constrain(B - X, s, dmp) +
......
...@@ -10,8 +10,9 @@ ...@@ -10,8 +10,9 @@
*/ */
#include "./av1_rtcd.h" #include "./av1_rtcd.h"
#include "aom_ports/mem.h" #include "./cdef_simd.h"
#include "aom_ports/bitops.h" #include "aom_ports/bitops.h"
#include "aom_ports/mem.h"
// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp))) // sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength, SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
...@@ -242,17 +243,6 @@ void SIMD_FUNC(aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride, ...@@ -242,17 +243,6 @@ void SIMD_FUNC(aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride,
} }
} }
// Per 16-bit lane:
//   sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
// adjdamp is applied as-is as the shift count for abs(a - b).
SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength,
                               unsigned int adjdamp) {
  const v128 delta = v128_sub_16(a, b);
  // Signed shift by 15: the mask is expected to be all ones in lanes where
  // a - b is negative and zero elsewhere.
  const v128 neg_mask = v128_shr_n_s16(delta, 15);
  const v128 magnitude = v128_abs_s16(delta);
  // The saturating subtract provides the max(0, ...) part of the formula.
  const v128 limit = v128_ssub_u16(v128_dup_16(strength),
                                   v128_shr_u16(magnitude, adjdamp));
  const v128 clamped = v128_min_s16(magnitude, limit);
  // (x + mask) ^ mask negates x where mask is all ones, restoring the sign.
  return v128_xor(v128_add_16(neg_mask, clamped), neg_mask);
}
// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) + // delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
// 1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) + // 1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
// 3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) + // 3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
...@@ -261,13 +251,12 @@ SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e, ...@@ -261,13 +251,12 @@ SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
v128 f, v128 g, v128 h, unsigned int s, v128 f, v128 g, v128 h, unsigned int s,
unsigned int dmp) { unsigned int dmp) {
const v128 bdeg = v128_add_16( const v128 bdeg = v128_add_16(
v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(d, x, s, dmp)), v128_add_16(constrain16(b, x, s, dmp), constrain16(d, x, s, dmp)),
v128_add_16(constrain_hbd(e, x, s, dmp), constrain_hbd(g, x, s, dmp))); v128_add_16(constrain16(e, x, s, dmp), constrain16(g, x, s, dmp)));
const v128 delta = v128_add_16( const v128 delta = v128_add_16(
v128_add_16( v128_add_16(
v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(c, x, s, dmp)), v128_add_16(constrain16(a, x, s, dmp), constrain16(c, x, s, dmp)),
v128_add_16(constrain_hbd(f, x, s, dmp), v128_add_16(constrain16(f, x, s, dmp), constrain16(h, x, s, dmp))),
constrain_hbd(h, x, s, dmp))),
v128_add_16(v128_add_16(bdeg, bdeg), bdeg)); v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
return v128_add_16( return v128_add_16(
x, x,
...@@ -297,9 +286,9 @@ static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e, ...@@ -297,9 +286,9 @@ static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
unsigned int s, unsigned int dmp) { unsigned int s, unsigned int dmp) {
const v128 bc = const v128 bc =
v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(c, x, s, dmp)); v128_add_16(constrain16(b, x, s, dmp), constrain16(c, x, s, dmp));
const v128 delta = v128_add_16( const v128 delta = v128_add_16(
v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(d, x, s, dmp)), v128_add_16(constrain16(a, x, s, dmp), constrain16(d, x, s, dmp)),
v128_add_16(v128_add_16(bc, bc), bc)); v128_add_16(v128_add_16(bc, bc), bc));
return v128_add_16( return v128_add_16(
x, x,
......
...@@ -115,7 +115,7 @@ int od_dir_find8_c(const uint16_t *img, int stride, int32_t *var, ...@@ -115,7 +115,7 @@ int od_dir_find8_c(const uint16_t *img, int stride, int32_t *var,
/* Smooth in the direction detected. */ /* Smooth in the direction detected. */
void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride,
const uint16_t *in, int threshold, const uint16_t *in, int threshold,
int dir) { int dir, int damping) {
int i; int i;
int j; int j;
int k; int k;
...@@ -134,8 +134,8 @@ void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, ...@@ -134,8 +134,8 @@ void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride,
xx; xx;
p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] - p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
xx; xx;
if (abs(p0) < threshold) sum += taps[k] * p0; sum += taps[k] * constrain(p0, threshold, damping);
if (abs(p1) < threshold) sum += taps[k] * p1; sum += taps[k] * constrain(p1, threshold, damping);
} }
sum = (sum + 8) >> 4; sum = (sum + 8) >> 4;
yy = xx + sum; yy = xx + sum;
...@@ -147,7 +147,7 @@ void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride, ...@@ -147,7 +147,7 @@ void od_filter_dering_direction_8x8_c(uint16_t *y, int ystride,
/* Smooth in the direction detected. */ /* Smooth in the direction detected. */
void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
const uint16_t *in, int threshold, const uint16_t *in, int threshold,
int dir) { int dir, int damping) {
int i; int i;
int j; int j;
int k; int k;
...@@ -166,8 +166,8 @@ void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, ...@@ -166,8 +166,8 @@ void od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
xx; xx;
p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] - p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
xx; xx;
if (abs(p0) < threshold) sum += taps[k] * p0; sum += taps[k] * constrain(p0, threshold, damping);
if (abs(p1) < threshold) sum += taps[k] * p1; sum += taps[k] * constrain(p1, threshold, damping);
} }
sum = (sum + 8) >> 4; sum = (sum + 8) >> 4;
yy = xx + sum; yy = xx + sum;
...@@ -298,6 +298,7 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec, ...@@ -298,6 +298,7 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = { od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = {
od_filter_dering_direction_4x4, od_filter_dering_direction_8x8 od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
}; };
clpf_damping += coeff_shift;
bsize = OD_DERING_SIZE_LOG2 - xdec; bsize = OD_DERING_SIZE_LOG2 - xdec;
if (!skip_dering) { if (!skip_dering) {
if (pli == 0) { if (pli == 0) {
...@@ -325,7 +326,7 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec, ...@@ -325,7 +326,7 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
(filter_dering_direction[bsize - OD_LOG_BSIZE0])( (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
&y[bi << 2 * bsize], 1 << bsize, &y[bi << 2 * bsize], 1 << bsize,
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]); od_adjust_thresh(threshold, var[by][bx]), dir[by][bx], 6);
} }
} else { } else {
for (bi = 0; bi < dering_count; bi++) { for (bi = 0; bi < dering_count; bi++) {
...@@ -334,7 +335,7 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec, ...@@ -334,7 +335,7 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
(filter_dering_direction[bsize - OD_LOG_BSIZE0])( (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
&y[bi << 2 * bsize], 1 << bsize, &y[bi << 2 * bsize], 1 << bsize,
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold, &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
dir[by][bx]); dir[by][bx], threshold == 0 ? 0 : get_msb(threshold) + 1);
} }
} }
} }
...@@ -356,14 +357,14 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec, ...@@ -356,14 +357,14 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
dst ? (uint16_t *)dst + py * dstride + px : &y[bi << 2 * bsize], dst ? (uint16_t *)dst + py * dstride + px : &y[bi << 2 * bsize],
in + py * OD_FILT_BSTRIDE + px, dst && hbd ? dstride : 1 << bsize, in + py * OD_FILT_BSTRIDE + px, dst && hbd ? dstride : 1 << bsize,
OD_FILT_BSTRIDE, 1 << bsize, 1 << bsize, OD_FILT_BSTRIDE, 1 << bsize, 1 << bsize,
clpf_strength << coeff_shift, clpf_damping + coeff_shift); clpf_strength << coeff_shift, clpf_damping);
} else { } else {
// Do clpf and write the result to an 8 bit destination // Do clpf and write the result to an 8 bit destination
(!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block (!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block
: aom_clpf_hblock)( : aom_clpf_hblock)(
dst + py * dstride + px, in + py * OD_FILT_BSTRIDE + px, dstride, dst + py * dstride + px, in + py * OD_FILT_BSTRIDE + px, dstride,
OD_FILT_BSTRIDE, 1 << bsize, 1 << bsize, OD_FILT_BSTRIDE, 1 << bsize, 1 << bsize,
clpf_strength << coeff_shift, clpf_damping + coeff_shift); clpf_strength << coeff_shift, clpf_damping);
} }
} }
} else { } else {
......
...@@ -41,7 +41,8 @@ typedef struct { ...@@ -41,7 +41,8 @@ typedef struct {
typedef void (*od_filter_dering_direction_func)(uint16_t *y, int ystride, typedef void (*od_filter_dering_direction_func)(uint16_t *y, int ystride,
const uint16_t *in, const uint16_t *in,
int threshold, int dir); int threshold, int dir,
int damping);
void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src, void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
dering_list *dlist, int dering_count, dering_list *dlist, int dering_count,
int bsize); int bsize);
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
*/ */
#include "./av1_rtcd.h" #include "./av1_rtcd.h"
#include "./cdef_simd.h"
#include "./od_dering.h" #include "./od_dering.h"
/* partial A is a 16-bit vector of the form: /* partial A is a 16-bit vector of the form:
...@@ -210,141 +211,109 @@ int SIMD_FUNC(od_dir_find8)(const od_dering_in *img, int stride, int32_t *var, ...@@ -210,141 +211,109 @@ int SIMD_FUNC(od_dir_find8)(const od_dering_in *img, int stride, int32_t *var,
return best_dir; return best_dir;
} }
static INLINE v128 od_cmplt_abs_epi16(v128 in, v128 threshold) {
return v128_cmplt_s16(v128_abs_s16(in), threshold);
}
void SIMD_FUNC(od_filter_dering_direction_4x4)(uint16_t *y, int ystride, void SIMD_FUNC(od_filter_dering_direction_4x4)(uint16_t *y, int ystride,
const uint16_t *in, const uint16_t *in,
int threshold, int dir) { int threshold, int dir,
int damping) {
int i; int i;
v128 sum; v128 p0, p1, sum, row, res;
v128 p; int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
v128 cmp; int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
v128 row;
v128 res; if (threshold) damping -= get_msb(threshold);
v128 tmp;
v128 thresh;
int off1, off2;
off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
thresh = v128_dup_16(threshold);
for (i = 0; i < 4; i += 2) { for (i = 0; i < 4; i += 2) {
sum = v128_zero(); sum = v128_zero();
row = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE]), row = v128_from_v64(v64_load_aligned(&in[i * OD_FILT_BSTRIDE]),
v64_load_aligned(&in[i * OD_FILT_BSTRIDE])); v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE]));
/*p = in[i*OD_FILT_BSTRIDE + offset] - row*/ // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off1]), p0 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]),
v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off1])); v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o1]));
p = v128_sub_16(tmp, row); p0 = constrain16(p0, row, threshold, damping);
/*if (abs(p) < thresh) sum += taps[k]*p*/
cmp = od_cmplt_abs_epi16(p, thresh); // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
p = v128_shl_n_16(p, 2); p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]),
p = v128_and(p, cmp); v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o1]));
sum = v128_add_16(sum, p); p1 = constrain16(p1, row, threshold, damping);
/*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off1]), // sum += 4 * (p0 + p1)
v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off1])); sum = v128_add_16(sum, v128_shl_n_16(v128_add_16(p0, p1), 2));
p = v128_sub_16(tmp, row);
/*if (abs(p) < thresh) sum += taps[k]*p1*/ // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
cmp = od_cmplt_abs_epi16(p, thresh); p0 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]),
p = v128_shl_n_16(p, 2); v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o2]));
p = v128_and(p, cmp); p0 = constrain16(p0, row, threshold, damping);
sum = v128_add_16(sum, p);
// p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
/*p = in[i*OD_FILT_BSTRIDE + offset] - row*/ p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]),
tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off2]), v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o2]));
v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off2])); p1 = constrain16(p1, row, threshold, damping);
p = v128_sub_16(tmp, row);
/*if (abs(p) < thresh) sum += taps[k]*p*/ // sum += 1 * (p0 + p1)
cmp = od_cmplt_abs_epi16(p, thresh); sum = v128_add_16(sum, v128_add_16(p0, p1));
p = v128_and(p, cmp);
sum = v128_add_16(sum, p); // res = row + ((sum + 8) >> 4)
/*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off2]),
v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off2]));
p = v128_sub_16(tmp, row);
/*if (abs(p) < thresh) sum += taps[k]*p1*/
cmp = od_cmplt_abs_epi16(p, thresh);
p = v128_and(p, cmp);
sum = v128_add_16(sum, p);
/*res = row + ((sum + 8) >> 4)*/
res = v128_add_16(sum, v128_dup_16(8)); res = v128_add_16(sum, v128_dup_16(8));
res = v128_shr_n_s16(res, 4); res = v128_shr_n_s16(res, 4);
res = v128_add_16(row, res); res = v128_add_16(row, res);
v64_store_aligned(&y[i * ystride], v128_low_v64(res)); v64_store_aligned(&y[i * ystride], v128_high_v64(res));
v64_store_aligned(&y[(i + 1) * ystride], v128_high_v64(res)); v64_store_aligned(&y[(i + 1) * ystride], v128_low_v64(res));
} }
} }
void SIMD_FUNC(od_filter_dering_direction_8x8)(uint16_t *y, int ystride, void SIMD_FUNC(od_filter_dering_direction_8x8)(uint16_t *y, int ystride,
const uint16_t *in, const uint16_t *in,
int threshold, int dir) { int threshold, int dir,
int damping) {
int i; int i;
v128 sum; v128 sum, p0, p1, row, res;
v128 p0, p1; int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
v128 cmp; int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
v128 row; int o3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
v128 res;
v128 thresh; if (threshold) damping -= get_msb(threshold);
int off1, off2, off3;
off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
thresh = v128_dup_16(threshold);
for (i = 0; i < 8; i++) { for (i = 0; i < 8; i++) {
sum = v128_zero(); sum = v128_zero();
row = v128_load_aligned(&in[i * OD_FILT_BSTRIDE]); row = v128_load_aligned(&in[i * OD_FILT_BSTRIDE]);
/*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/ // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off1]), row); p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]);
/*p0 = abs(p0) < thresh ? p0 : 0*/ p0 = constrain16(p0, row, threshold, damping);
cmp = od_cmplt_abs_epi16(p0, thresh);
p0 = v128_and(p0, cmp); // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]);
/*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/ p1 = constrain16(p1, row, threshold, damping);
p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off1]), row);
/*p1 = abs(p1) < thresh ? p1 : 0*/ // sum += 3 * (p0 + p1)
cmp = od_cmplt_abs_epi16(p1, thresh);
p1 = v128_and(p1, cmp);
/*sum += 3*(p0 + p1)*/
p0 = v128_add_16(p0, p1); p0 = v128_add_16(p0, p1);
p0 = v128_add_16(p0, v128_shl_n_16(p0, 1)); p0 = v128_add_16(p0, v128_shl_n_16(p0, 1));
sum = v128_add_16(sum, p0); sum = v128_add_16(sum, p0);
/*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/ // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off2]), row); p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]);
/*p0 = abs(p0) < thresh ? p0 : 0*/ p0 = constrain16(p0, row, threshold, damping);
cmp = od_cmplt_abs_epi16(p0, thresh);
p0 = v128_and(p0, cmp); // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]);
/*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/ p1 = constrain16(p1, row, threshold, damping);
p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off2]), row);
/*p1 = abs(p1) < thresh ? p1 : 0*/ // sum += 2 * (p0 + p1)
cmp = od_cmplt_abs_epi16(p1, thresh);
p1 = v128_and(p1, cmp);
/* sum += 2*(p0 + p1)*/
p0 = v128_shl_n_16(v128_add_16(p0, p1), 1); p0 = v128_shl_n_16(v128_add_16(p0, p1), 1);
sum = v128_add_16(sum, p0); sum = v128_add_16(sum, p0);
/*p0 = in[i*OD_FILT_BSTRIDE + offset] - row*/ // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
p0 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off3]), row); p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o3]);
/*p0 = abs(p0) < thresh ? p0 : 0*/ p0 = constrain16(p0, row, threshold, damping);
cmp = od_cmplt_abs_epi16(p0, thresh);
p0 = v128_and(p0, cmp); // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o3]);
/*p1 = in[i*OD_FILT_BSTRIDE - offset] - row*/ p1 = constrain16(p1, row, threshold, damping);
p1 = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off3]), row);
/*p1 = abs(p1) < thresh ? p1 : 0*/ // sum += (p0 + p1)
cmp = od_cmplt_abs_epi16(p1, thresh);
p1 = v128_and(p1, cmp);
/*sum += (p0 + p1)*/
p0 = v128_add_16(p0, p1); p0 = v128_add_16(p0, p1);
sum = v128_add_16(sum, p0); sum = v128_add_16(sum, p0);
/*res = row + ((sum + 8) >> 4)*/ // res = row + ((sum + 8) >> 4)
res = v128_add_16(sum, v128_dup_16(8)); res = v128_add_16(sum, v128_dup_16(8));
res = v128_shr_n_s16(res, 4); res = v128_shr_n_s16(res, 4);
res = v128_add_16(row, res); res = v128_add_16(row, res);
......
...@@ -27,10 +27,9 @@ using libaom_test::ACMRandom; ...@@ -27,10 +27,9 @@ using libaom_test::ACMRandom;
namespace { namespace {
typedef void (*dering_dir_t)(uint16_t *y, int ystride, const uint16_t *in, typedef std::tr1::tuple<od_filter_dering_direction_func,
int threshold, int dir); od_filter_dering_direction_func, int>
dering_dir_param_t;
typedef std::tr1::tuple<dering_dir_t, dering_dir_t, int> dering_dir_param_t;
class CDEFDeringDirTest : public ::testing::TestWithParam<dering_dir_param_t> { class CDEFDeringDirTest : public ::testing::TestWithParam<dering_dir_param_t> {
public: public:
...@@ -45,18 +44,15 @@ class CDEFDeringDirTest : public ::testing::TestWithParam<dering_dir_param_t> { ...@@ -45,18 +44,15 @@ class CDEFDeringDirTest : public ::testing::TestWithParam<dering_dir_param_t> {
protected: protected:
int bsize; int bsize;
dering_dir_t dering; od_filter_dering_direction_func dering;