Commit 3c33def7 authored by Steinar Midtskogen

Limit line buffer to 6 lines

Change-Id: I6fedfa6427865e9a37fbdf9d9c1bf8be55222cba
parent d280a845
......@@ -853,6 +853,7 @@ specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
if (aom_config("CONFIG_CDEF") eq "yes") {
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
add_proto qw/void aom_clpf_hblock_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp";
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp";
......@@ -866,6 +867,7 @@ if (aom_config("CONFIG_CDEF") eq "yes") {
}
if ($opts{config} !~ /libs-x86-win32-vs.*/) {
specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
}
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp";
......
......@@ -31,6 +31,13 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
return (8 + delta - (delta < 0)) >> 4;
}
// Horizontal CLPF tap for a single sample.
//
// X is the centre sample and A, B, C, D are the four other samples. Each
// difference from X is passed through constrain() together with the
// strength s and damping dmp, and the results are weighted 1, 3, 3, 1.
// Returns the weighted sum divided by 8 with rounding; callers add the
// result to X.
int av1_clpf_hsample(int X, int A, int B, int C, int D, int s,
                     unsigned int dmp) {
  const int outer = constrain(A - X, s, dmp) + constrain(D - X, s, dmp);
  const int inner = constrain(B - X, s, dmp) + constrain(C - X, s, dmp);
  const int delta = outer + 3 * inner;
  // Negative sums are lowered by one before the shift so that, assuming an
  // arithmetic right shift, rounding behaves the same on both sides of zero.
  return (4 + delta - (delta < 0)) >> 3;
}
void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, unsigned int damping) {
......@@ -78,3 +85,22 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
}
}
}
// C implementation of the horizontal CLPF on 16-bit sample buffers.
//
// Filters the sizex x sizey block whose top-left corner is (x0, y0), reading
// from src (row stride sstride) and writing to dst (row stride dstride).
// Every filtered sample also reads the two samples on either side of it in
// the same row, so src must be readable from x0 - 2 to x0 + sizex + 1.
//
// TODO(stemidts): Put under CONFIG_AOM_HIGHBITDEPTH if CDEF does 8 bit
// internally.
void aom_clpf_hblock_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
                           int dstride, int x0, int y0, int sizex, int sizey,
                           unsigned int strength, unsigned int damping) {
  const int col_end = x0 + sizex;
  const int row_end = y0 + sizey;
  int row, col;
  for (row = y0; row < row_end; row++) {
    const int in_row = row * sstride;
    const int out_row = row * dstride;
    for (col = x0; col < col_end; col++) {
      const int centre = src[in_row + col];
      const int left2 = src[in_row + col - 2];
      const int left1 = src[in_row + col - 1];
      const int right1 = src[in_row + col + 1];
      const int right2 = src[in_row + col + 2];
      dst[out_row + col] =
          centre + av1_clpf_hsample(centre, left2, left1, right1, right2,
                                    strength, damping);
    }
  }
}
......@@ -175,6 +175,36 @@ static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
v128_store_aligned(dst, calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp));
}
// Returns x + ((4 + delta - (delta < 0)) >> 3), where
//   delta = 1 * constrain(a, x, s, dmp) + 3 * constrain(b, x, s, dmp) +
//           3 * constrain(c, x, s, dmp) + 1 * constrain(d, x, s, dmp)
// i.e. the taps carry weights of 1/8, 3/8, 3/8 and 1/8 after the shift by 3.
SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
                                 unsigned int s, unsigned int dmp) {
  const v128 outer =
      v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(d, x, s, dmp));
  const v128 inner =
      v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(c, x, s, dmp));
  // Three times the inner pair, built from additions only.
  const v128 inner_x3 = v128_add_16(v128_add_16(inner, inner), inner);
  const v128 delta = v128_add_16(outer, inner_x3);
  // NOTE(review): the compare presumably yields -1 where delta is negative,
  // which makes the add below the vector form of "- (delta < 0)" - confirm
  // against the v128 intrinsics.
  const v128 neg_adjust = v128_cmplt_s16(delta, v128_zero());
  const v128 biased =
      v128_add_16(v128_dup_16(4), v128_add_16(delta, neg_adjust));
  return v128_add_16(x, v128_shr_s16(biased, 3));
}
// Filters o with calc_hdelta_hbd() and stores the result as two 64-bit
// halves: the high half at dst and the low half at dst + dstride.
static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d,
                             uint16_t *dst, unsigned int s, unsigned int dmp,
                             int dstride) {
  const v128 filtered = calc_hdelta_hbd(o, a, b, c, d, s, dmp);
  v64_store_aligned(dst, v128_high_v64(filtered));
  v64_store_aligned(dst + dstride, v128_low_v64(filtered));
}
// Filters o with calc_hdelta_hbd() and writes the whole 128-bit result to
// dst with an aligned store.
static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d,
                             uint16_t *dst, unsigned int s, unsigned int dmp) {
  const v128 filtered = calc_hdelta_hbd(o, a, b, c, d, s, dmp);
  v128_store_aligned(dst, filtered);
}
// Process blocks of width 4, two lines at a time.
SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0,
......@@ -236,6 +266,57 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
}
}
// Horizontal filter for blocks of width 4, two lines at a time.
//
// Each iteration packs one row into the high half and the following row into
// the low half of every vector, so the loop advances by two rows per pass.
// The taps are loaded from offsets -2, -1, +1 and +2 relative to the centre.
SIMD_INLINE void clpf_hblock_hbd4(const uint16_t *src, uint16_t *dst,
                                  int sstride, int dstride, int x0, int y0,
                                  int sizey, unsigned int strength,
                                  unsigned int dmp) {
  const uint16_t *row0 = src + (x0 + y0 * sstride);
  uint16_t *out = dst + (x0 + y0 * dstride);
  int rows_done;
  for (rows_done = 0; rows_done < sizey; rows_done += 2) {
    const uint16_t *const row1 = row0 + sstride;
    const v128 o =
        v128_from_v64(v64_load_unaligned(row0), v64_load_unaligned(row1));
    const v128 a = v128_from_v64(v64_load_unaligned(row0 - 2),
                                 v64_load_unaligned(row1 - 2));
    const v128 b = v128_from_v64(v64_load_unaligned(row0 - 1),
                                 v64_load_unaligned(row1 - 1));
    const v128 c = v128_from_v64(v64_load_unaligned(row0 + 1),
                                 v64_load_unaligned(row1 + 1));
    const v128 d = v128_from_v64(v64_load_unaligned(row0 + 2),
                                 v64_load_unaligned(row1 + 2));
    calc_hdelta_hbd4(o, a, b, c, d, out, strength, dmp, dstride);
    row0 += sstride * 2;
    out += dstride * 2;
  }
}
// Horizontal filter for blocks of width 8, one line per iteration.
//
// The centre samples are read with an aligned load and the result is written
// with an aligned store (via calc_hdelta_hbd8); the four taps at offsets
// -2, -1, +1 and +2 use unaligned loads.
SIMD_INLINE void clpf_hblock_hbd(const uint16_t *src, uint16_t *dst,
                                 int sstride, int dstride, int x0, int y0,
                                 int sizey, unsigned int strength,
                                 unsigned int dmp) {
  const uint16_t *in = src + (x0 + y0 * sstride);
  uint16_t *out = dst + (x0 + y0 * dstride);
  int rows_left;
  for (rows_left = sizey; rows_left > 0; rows_left--) {
    const v128 centre = v128_load_aligned(in);
    const v128 left2 = v128_load_unaligned(in - 2);
    const v128 left1 = v128_load_unaligned(in - 1);
    const v128 right1 = v128_load_unaligned(in + 1);
    const v128 right2 = v128_load_unaligned(in + 2);
    calc_hdelta_hbd8(centre, left2, left1, right1, right2, out, strength, dmp);
    in += sstride;
    out += dstride;
  }
}
void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0,
int sizex, int sizey, unsigned int strength,
......@@ -251,4 +332,20 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
}
}
// SIMD entry point for the horizontal CLPF on 16-bit samples.
//
// Dispatches on block width:
//  * width 4 with an even height -> clpf_hblock_hbd4
//  * width 8                     -> clpf_hblock_hbd
//  * anything else               -> the C implementation
void SIMD_FUNC(aom_clpf_hblock_hbd)(const uint16_t *src, uint16_t *dst,
                                    int sstride, int dstride, int x0, int y0,
                                    int sizex, int sizey, unsigned int strength,
                                    unsigned int dmp) {
  switch (sizex) {
    case 4:
      // The width-4 kernel consumes rows in pairs, so odd heights fall
      // through to the C fallback below.
      if (!(sizey & 1)) {
        clpf_hblock_hbd4(src, dst, sstride, dstride, x0, y0, sizey, strength,
                         dmp);
        return;
      }
      break;
    case 8:
      clpf_hblock_hbd(src, dst, sstride, dstride, x0, y0, sizey, strength,
                      dmp);
      return;
    default: break;
  }
  // Fallback to C for odd sizes.
  aom_clpf_hblock_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
                        strength, dmp);
}
#endif
......@@ -288,9 +288,10 @@ void od_dering(uint16_t *y, uint16_t *in, int xdec,
by = dlist[bi].by;
bx = dlist[bi].bx;
aom_clpf_block_hbd(in, &y[((bi - by) << 2 * bsize) - (bx << bsize)],
OD_FILT_BSTRIDE, 1 << bsize, bx << bsize, by << bsize,
1 << bsize, 1 << bsize, clpf_strength << coeff_shift,
clpf_damping + coeff_shift);
(!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd
: aom_clpf_hblock_hbd)(
in, &y[((bi - by) << 2 * bsize) - (bx << bsize)], OD_FILT_BSTRIDE,
1 << bsize, bx << bsize, by << bsize, 1 << bsize, 1 << bsize,
clpf_strength << coeff_shift, clpf_damping + coeff_shift);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment