Commit 8322ff04 authored by Steinar Midtskogen, committed by Sebastien Alaiwan

Remove CDEF_SINGLEPASS defines

The experiment has been adopted and has been enabled by default for a
while. The alternative code path has not been maintained for a long
time, so it is now removed.

Change-Id: Iaf22f2969b45b71b2bf67707e131ab4c439b7fa6
parent 27b5136f
@@ -279,30 +279,6 @@ set(AOM_AV1_COMMON_INTRIN_NEON
${AOM_AV1_COMMON_INTRIN_NEON}
"${AOM_ROOT}/av1/common/cdef_block_neon.c")
if (NOT CONFIG_CDEF_SINGLEPASS)
set(AOM_AV1_COMMON_SOURCES
${AOM_AV1_COMMON_SOURCES}
"${AOM_ROOT}/av1/common/clpf.c"
"${AOM_ROOT}/av1/common/clpf_simd.h"
"${AOM_ROOT}/av1/common/cdef_block_simd.h")
set(AOM_AV1_COMMON_INTRIN_SSE2
${AOM_AV1_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/av1/common/clpf_sse2.c")
set(AOM_AV1_COMMON_INTRIN_SSSE3
${AOM_AV1_COMMON_INTRIN_SSSE3}
"${AOM_ROOT}/av1/common/clpf_ssse3.c")
set(AOM_AV1_COMMON_INTRIN_SSE4_1
${AOM_AV1_COMMON_INTRIN_SSE4_1}
"${AOM_ROOT}/av1/common/clpf_sse4.c")
set(AOM_AV1_COMMON_INTRIN_NEON
${AOM_AV1_COMMON_INTRIN_NEON}
"${AOM_ROOT}/av1/common/clpf_neon.c")
endif ()
set(AOM_AV1_COMMON_INTRIN_SSE2
${AOM_AV1_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c")
......
@@ -94,16 +94,7 @@ AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/intra_edge_sse4.c
endif
AV1_COMMON_SRCS-yes += common/warped_motion.h
AV1_COMMON_SRCS-yes += common/warped_motion.c
ifeq ($(CONFIG_CDEF_SINGLEPASS),yes)
AV1_COMMON_SRCS-$(HAVE_AVX2) += common/cdef_block_avx2.c
else
AV1_COMMON_SRCS-yes += common/clpf.c
AV1_COMMON_SRCS-yes += common/clpf_simd.h
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
AV1_COMMON_SRCS-$(HAVE_NEON) += common/clpf_neon.c
endif
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/cdef_block_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/cdef_block_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/cdef_block_sse4.c
......
@@ -499,20 +499,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
# Deringing Functions
add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
if (aom_config("CONFIG_CDEF_SINGLEPASS") ne "yes") {
add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
add_proto qw/void cdef_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
add_proto qw/void cdef_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir, int damping";
add_proto qw/void copy_8x8_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
add_proto qw/void copy_8x8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
add_proto qw/void copy_4x4_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
} else {
add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift";
}
add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift";
add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
@@ -521,28 +508,10 @@ add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const
# structs as arguments, which makes the v256 type of the intrinsics
# hard to support, so optimizations for this target are disabled.
if ($opts{config} !~ /libs-x86-win32-vs.*/) {
if (aom_config("CONFIG_CDEF_SINGLEPASS") eq "yes") {
specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
} else {
specialize qw/cdef_find_dir sse2 ssse3 sse4_1 neon/;
specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
specialize qw/cdef_find_dir sse2 ssse3 sse4_1 neon/;
specialize qw/cdef_direction_4x4 sse2 ssse3 sse4_1 neon/;
specialize qw/cdef_direction_8x8 sse2 ssse3 sse4_1 neon/;
specialize qw/copy_8x8_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
specialize qw/copy_4x4_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
specialize qw/copy_8x8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
specialize qw/copy_4x4_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 neon/;
specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
}
specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
}
# WARPED_MOTION / GLOBAL_MOTION functions
......
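For orientation, a rough sketch of what a surviving specialize line means at
run time. This is a hand-written illustration, not libaom's generated rtcd
glue; the variant bodies are stand-ins and only the suffix convention
(_c/_avx2) is taken from the lines above.

#include <stdint.h>

typedef int (*cdef_find_dir_fn)(const uint16_t *img, int stride,
                                int32_t *var, int coeff_shift);

static int find_dir_c(const uint16_t *img, int stride, int32_t *var,
                      int coeff_shift) {
  (void)img; (void)stride; (void)coeff_shift;
  *var = 0;
  return 0; /* placeholder for the real direction search */
}

static int find_dir_avx2(const uint16_t *img, int stride, int32_t *var,
                         int coeff_shift) {
  return find_dir_c(img, stride, var, coeff_shift); /* placeholder */
}

static cdef_find_dir_fn cdef_find_dir_ptr = find_dir_c; /* default: C path */

static void rtcd_init(int have_avx2) {
  /* "specialize qw/cdef_find_dir ... avx2/" amounts to re-aiming the
     pointer at the fastest variant the CPU supports. */
  if (have_avx2) cdef_find_dir_ptr = find_dir_avx2;
}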
@@ -281,9 +281,6 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
curr_row_cdef[fbc] = 1;
for (int pli = 0; pli < nplanes; pli++) {
#if !CONFIG_CDEF_SINGLEPASS
DECLARE_ALIGNED(16, uint16_t, dst[CDEF_BLOCKSIZE * CDEF_BLOCKSIZE]);
#endif
int coffset;
int rend, cend;
int pri_damping = cm->cdef_pri_damping;
@@ -399,27 +396,16 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth) {
cdef_filter_fb(
#if CONFIG_CDEF_SINGLEPASS
NULL,
&CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
#else
(uint8_t *)&CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
#endif
[xd->plane[pli].dst.stride *
(MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
(fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
#if CONFIG_CDEF_SINGLEPASS
&CONVERT_TO_SHORTPTR(
xd->plane[pli]
.dst.buf)[xd->plane[pli].dst.stride *
(MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
(fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
xd->plane[pli].dst.stride,
#else
xd->plane[pli].dst.stride, dst,
#endif
&src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
#if CONFIG_CDEF_SINGLEPASS
sec_strength, pri_damping, sec_damping, coeff_shift);
#else
sec_strength, sec_damping, pri_damping, coeff_shift, 0, 1);
#endif
} else {
#endif
cdef_filter_fb(
@@ -427,18 +413,10 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
.dst.buf[xd->plane[pli].dst.stride *
(MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
(fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
#if CONFIG_CDEF_SINGLEPASS
NULL, xd->plane[pli].dst.stride,
#else
xd->plane[pli].dst.stride, dst,
#endif
&src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
#if CONFIG_CDEF_SINGLEPASS
sec_strength, pri_damping, sec_damping, coeff_shift);
#else
sec_strength, sec_damping, pri_damping, coeff_shift, 0, 0);
#endif
#if CONFIG_HIGHBITDEPTH
}
......
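The net effect on these call sites: the old two-pass path passed a scratch
dst buffer plus skip_dering/hbd flags (with sec_damping before pri_damping),
while the single-pass path passes a dst8/dst16 pair of which exactly one is
non-NULL. A minimal sketch of that selection, with an illustrative wrapper
and buffer names (not libaom code):

#include <stdint.h>
#include <stddef.h>

typedef void (*filter_fb_like)(uint8_t *dst8, uint16_t *dst16);

static void filter_plane(int use_highbitdepth, uint8_t *buf8,
                         uint16_t *buf16, filter_fb_like cdef_fb) {
  if (use_highbitdepth)
    cdef_fb(NULL, buf16); /* high bit depth: write 16-bit, dst8 == NULL */
  else
    cdef_fb(buf8, NULL);  /* 8-bit: write 8-bit, dst16 == NULL */
}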
@@ -21,7 +21,7 @@
#include "./cdef.h"
/* Generated from gen_filter_tables.c. */
#if !CONFIG_CDEF_SINGLEPASS || CDEF_FULL
#if CDEF_FULL
DECLARE_ALIGNED(16, const int, cdef_directions[8][3]) = {
{ -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2, -3 * CDEF_BSTRIDE + 3 },
{ 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2, -1 * CDEF_BSTRIDE + 3 },
@@ -123,7 +123,6 @@ int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var,
return best_dir;
}
#if CONFIG_CDEF_SINGLEPASS
#if CDEF_FULL
const int cdef_pri_taps[2][3] = { { 3, 2, 1 }, { 2, 2, 2 } };
const int cdef_sec_taps[2][2] = { { 3, 1 }, { 3, 1 } };
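The body of cdef_filter_block_c is elided in the hunk that follows; for
orientation, it forms a weighted sum of constrained pixel differences using
the primary taps along the detected direction and the secondary taps at 45
degrees to it, then rounds. A hedged skeleton of just the
weighted-sum-and-round step, with inputs simplified to precomputed
constrained differences (helper name is made up):

/* pd/sd: constrained differences along the primary/secondary
   directions; taps as declared above. The rounding matches CDEF's
   (8 + sum - (sum < 0)) >> 4, which handles negative sums
   symmetrically. */
static int weighted_round(const int *pd, int n_pri, const int *pri_taps,
                          const int *sd, int n_sec, const int *sec_taps) {
  int sum = 0;
  for (int k = 0; k < n_pri; k++) sum += pri_taps[k] * pd[k];
  for (int k = 0; k < n_sec; k++) sum += sec_taps[k] * sd[k];
  return (8 + sum - (sum < 0)) >> 4;
}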
@@ -209,67 +208,6 @@ void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
}
}
#else
/* Smooth in the direction detected. */
void cdef_direction_8x8_c(uint16_t *y, int ystride, const uint16_t *in,
int threshold, int dir, int damping) {
int i;
int j;
int k;
static const int taps[3] = { 3, 2, 1 };
for (i = 0; i < 8; i++) {
for (j = 0; j < 8; j++) {
int16_t sum;
int16_t xx;
int16_t yy;
xx = in[i * CDEF_BSTRIDE + j];
sum = 0;
for (k = 0; k < 3; k++) {
int16_t p0;
int16_t p1;
p0 = in[i * CDEF_BSTRIDE + j + cdef_directions[dir][k]] - xx;
p1 = in[i * CDEF_BSTRIDE + j - cdef_directions[dir][k]] - xx;
sum += taps[k] * constrain(p0, threshold, damping);
sum += taps[k] * constrain(p1, threshold, damping);
}
sum = (sum + 8) >> 4;
yy = xx + sum;
y[i * ystride + j] = yy;
}
}
}
/* Smooth in the direction detected. */
void cdef_direction_4x4_c(uint16_t *y, int ystride, const uint16_t *in,
int threshold, int dir, int damping) {
int i;
int j;
int k;
static const int taps[2] = { 4, 1 };
for (i = 0; i < 4; i++) {
for (j = 0; j < 4; j++) {
int16_t sum;
int16_t xx;
int16_t yy;
xx = in[i * CDEF_BSTRIDE + j];
sum = 0;
for (k = 0; k < 2; k++) {
int16_t p0;
int16_t p1;
p0 = in[i * CDEF_BSTRIDE + j + cdef_directions[dir][k]] - xx;
p1 = in[i * CDEF_BSTRIDE + j - cdef_directions[dir][k]] - xx;
sum += taps[k] * constrain(p0, threshold, damping);
sum += taps[k] * constrain(p1, threshold, damping);
}
sum = (sum + 8) >> 4;
yy = xx + sum;
y[i * ystride + j] = yy;
}
}
}
#endif
/* Compute the primary filter strength for an 8x8 block based on the
directional variance difference. A high variance difference means
that we have a highly directional pattern (e.g. a high contrast
@@ -282,160 +220,26 @@ static INLINE int adjust_strength(int strength, int32_t var) {
return var ? (strength * (4 + i) + 8) >> 4 : 0;
}
#if !CONFIG_CDEF_SINGLEPASS
void copy_8x8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
int sstride) {
int i, j;
for (i = 0; i < 8; i++)
for (j = 0; j < 8; j++) dst[i * dstride + j] = src[i * sstride + j];
}
void copy_4x4_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
int sstride) {
int i, j;
for (i = 0; i < 4; i++)
for (j = 0; j < 4; j++) dst[i * dstride + j] = src[i * sstride + j];
}
static void copy_block_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
cdef_list *dlist, int cdef_count,
int bsize) {
int bi, bx, by;
if (bsize == BLOCK_8X8) {
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_8x8_16bit_to_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
&src[bi << (3 + 3)], 8);
}
} else if (bsize == BLOCK_4X8) {
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
&src[bi << (3 + 2)], 4);
copy_4x4_16bit_to_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)],
dstride, &src[(bi << (3 + 2)) + 4 * 4], 4);
}
} else if (bsize == BLOCK_8X4) {
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
&src[bi << (2 + 3)], 8);
copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4],
dstride, &src[(bi << (2 + 3)) + 4], 8);
}
} else {
assert(bsize == BLOCK_4X4);
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
&src[bi << (2 + 2)], 4);
}
}
}
void copy_8x8_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src,
int sstride) {
int i, j;
for (i = 0; i < 8; i++)
for (j = 0; j < 8; j++)
dst[i * dstride + j] = (uint8_t)src[i * sstride + j];
}
void copy_4x4_16bit_to_8bit_c(uint8_t *dst, int dstride, const uint16_t *src,
int sstride) {
int i, j;
for (i = 0; i < 4; i++)
for (j = 0; j < 4; j++)
dst[i * dstride + j] = (uint8_t)src[i * sstride + j];
}
static void copy_block_16bit_to_8bit(uint8_t *dst, int dstride,
const uint16_t *src, cdef_list *dlist,
int cdef_count, int bsize) {
int bi, bx, by;
if (bsize == BLOCK_8X8) {
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_8x8_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
&src[bi << (3 + 3)], 8);
}
} else if (bsize == BLOCK_4X8) {
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
&src[bi << (3 + 2)], 4);
copy_4x4_16bit_to_8bit(&dst[((by << 3) + 4) * dstride + (bx << 2)],
dstride, &src[(bi << (3 + 2)) + 4 * 4], 4);
}
} else if (bsize == BLOCK_8X4) {
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
&src[bi << (2 + 3)], 8);
copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 3) + 4], dstride,
&src[(bi << (2 + 3)) + 4], 8);
}
} else {
assert(bsize == BLOCK_4X4);
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
&src[bi << (2 * 2)], 4);
}
}
}
void cdef_filter_fb(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in,
int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
cdef_list *dlist, int cdef_count, int level,
int sec_strength, int sec_damping, int pri_damping,
int coeff_shift, int skip_dering, int hbd) {
#else
void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
cdef_list *dlist, int cdef_count, int level,
int sec_strength, int pri_damping, int sec_damping,
int coeff_shift) {
#endif
int bi;
int bx;
int by;
int bsize, bsizex, bsizey;
#if CONFIG_CDEF_SINGLEPASS
int pri_strength = level << coeff_shift;
sec_strength <<= coeff_shift;
#else
int threshold = level << coeff_shift;
cdef_direction_func cdef_direction[] = { cdef_direction_4x4,
cdef_direction_8x8 };
#endif
sec_damping += coeff_shift - (pli != AOM_PLANE_Y);
pri_damping += coeff_shift - (pli != AOM_PLANE_Y);
bsize =
ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
bsizex = 3 - xdec;
bsizey = 3 - ydec;
#if CONFIG_CDEF_SINGLEPASS
if (dirinit && pri_strength == 0 && sec_strength == 0)
#else
if (!skip_dering)
#endif
{
#if CONFIG_CDEF_SINGLEPASS
if (dirinit && pri_strength == 0 && sec_strength == 0) {
// If we're here, both primary and secondary strengths are 0, and
// we still haven't written anything to y[] yet, so we just copy
// the input to y[]. This is necessary only for av1_cdef_search()
@@ -443,106 +247,16 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
#else
if (pli == 0) {
if (!dirinit || !*dirinit) {
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx],
CDEF_BSTRIDE, &var[by][bx], coeff_shift);
}
if (dirinit) *dirinit = 1;
}
}
if (pli == 1 && xdec != ydec) {
for (bi = 0; bi < cdef_count; bi++) {
static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 };
static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 };
by = dlist[bi].by;
bx = dlist[bi].bx;
dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
}
}
// Only run dering for non-zero threshold. If we don't dering, we
// still need to eventually write something out in y[] later.
if (threshold != 0) {
assert(bsize == BLOCK_8X8 || bsize == BLOCK_4X4);
for (bi = 0; bi < cdef_count; bi++) {
int t = dlist[bi].skip ? 0 : threshold;
by = dlist[bi].by;
bx = dlist[bi].bx;
(cdef_direction[bsize == BLOCK_8X8])(
&y[bi << (bsizex + bsizey)], 1 << bsizex,
&in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
pli ? t : adjust_strength(t, var[by][bx]), dir[by][bx],
pri_damping);
}
}
}
if (sec_strength) {
if (threshold && !skip_dering)
copy_block_16bit_to_16bit(in, CDEF_BSTRIDE, y, dlist, cdef_count, bsize);
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
int py = by << bsizey;
int px = bx << bsizex;
if (dlist[bi].skip) continue;
if (!dst || hbd) {
// 16 bit destination if high bitdepth or 8 bit destination not given
(!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd
: aom_clpf_hblock_hbd)(
dst ? (uint16_t *)dst + py * dstride + px
: &y[bi << (bsizex + bsizey)],
in + py * CDEF_BSTRIDE + px, dst && hbd ? dstride : 1 << bsizex,
CDEF_BSTRIDE, 1 << bsizex, 1 << bsizey, sec_strength << coeff_shift,
sec_damping);
} else {
// Do clpf and write the result to an 8 bit destination
(!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block
: aom_clpf_hblock)(
dst + py * dstride + px, in + py * CDEF_BSTRIDE + px, dstride,
CDEF_BSTRIDE, 1 << bsizex, 1 << bsizey, sec_strength << coeff_shift,
sec_damping);
}
}
} else if (threshold != 0) {
// No clpf, so copy instead
if (hbd) {
copy_block_16bit_to_16bit((uint16_t *)dst, dstride, y, dlist, cdef_count,
bsize);
} else {
copy_block_16bit_to_8bit(dst, dstride, y, dlist, cdef_count, bsize);
}
} else if (dirinit) {
// If we're here, both dering and clpf are off, and we still haven't written
// anything to y[] yet, so we just copy the input to y[]. This is necessary
// only for av1_cdef_search() and only av1_cdef_search() sets dirinit.
for (bi = 0; bi < cdef_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
#endif
int iy, ix;
// TODO(stemidts/jmvalin): SIMD optimisations
for (iy = 0; iy < 1 << bsizey; iy++)
for (ix = 0; ix < 1 << bsizex; ix++)
#if CONFIG_CDEF_SINGLEPASS
dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
#else
y[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
#endif
in[((by << bsizey) + iy) * CDEF_BSTRIDE + (bx << bsizex) + ix];
}
#if CONFIG_CDEF_SINGLEPASS
return;
#endif
}
#if CONFIG_CDEF_SINGLEPASS
if (pli == 0) {
if (!dirinit || !*dirinit) {
for (bi = 0; bi < cdef_count; bi++) {
@@ -587,5 +301,4 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1,
coeff_shift);
}
#endif
}
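The max argument in the call above, (256 << coeff_shift) - 1, is the largest
representable pixel value. Assuming coeff_shift = bit_depth - 8 as derived
by the caller (an assumption here; it is set outside this hunk), the values
work out as:

#include <assert.h>

int main(void) {
  assert(((256 << 0) - 1) == 255);  /* 8-bit,  coeff_shift 0 */
  assert(((256 << 2) - 1) == 1023); /* 10-bit, coeff_shift 2 */
  assert(((256 << 4) - 1) == 4095); /* 12-bit, coeff_shift 4 */
  return 0;
}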
@@ -17,9 +17,7 @@
#define CDEF_BLOCKSIZE 64
#define CDEF_BLOCKSIZE_LOG2 6
#define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8)
#if CONFIG_CDEF_SINGLEPASS
#define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2)
#endif
/* We need to buffer three vertical lines. */
#define CDEF_VBORDER (3)
@@ -33,7 +31,6 @@
#define CDEF_INBUF_SIZE \
(CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
#if CONFIG_CDEF_SINGLEPASS
// Filter configuration
#define CDEF_CAP 1 // 1 = Cap change to largest diff
#define CDEF_FULL 0 // 1 = 7x7 filter, 0 = 5x5 filter
@@ -48,17 +45,12 @@ extern const int cdef_sec_taps[2][2];
DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]);
#endif
#else // CONFIG_CDEF_SINGLEPASS
DECLARE_ALIGNED(16, extern const int, cdef_directions[8][3]);
#endif
typedef struct {
uint8_t by;
uint8_t bx;
uint8_t skip;
} cdef_list;
#if CONFIG_CDEF_SINGLEPASS
typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16,
int dstride, const uint16_t *in,
int pri_strength, int sec_strength,
@@ -67,26 +59,11 @@ typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16,
int coeff_shift);
void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
cdef_list *dlist, int cdef_count, int bsize);
#else
typedef void (*cdef_direction_func)(uint16_t *y, int ystride,
const uint16_t *in, int threshold, int dir,
int damping);
#endif
#if CONFIG_CDEF_SINGLEPASS
void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
cdef_list *dlist, int cdef_count, int level,
int sec_strength, int pri_damping, int sec_damping,
int coeff_shift);
#else
void cdef_filter_fb(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in,
int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
cdef_list *dlist, int cdef_count, int level,
int sec_strength, int sec_damping, int pri_damping,
int coeff_shift, int skip_dering, int hbd);
#endif
#endif
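The header keeps the cdef_list block list unchanged, and every loop in
cdef_filter_fb walks it the same way. A small sketch of that traversal (the
struct mirrors cdef_list above; CDEF_BSTRIDE and the visitor itself are
illustrative):

#include <stdint.h>

typedef struct {
  uint8_t by;
  uint8_t bx;
  uint8_t skip;
} cdef_list_sketch; /* mirrors cdef_list above */

static void visit_blocks(const cdef_list_sketch *dlist, int cdef_count,
                         int bsizex, int bsizey) {
  for (int bi = 0; bi < cdef_count; bi++) {
    if (dlist[bi].skip) continue; /* flagged blocks are left untouched */
    /* Shifting by bsizex/bsizey (3 - xdec, 3 - ydec) converts the
       8x8-unit block coordinates to pixel offsets, as in cdef_filter_fb. */
    int px = dlist[bi].bx << bsizex;
    int py = dlist[bi].by << bsizey;
    (void)px;
    (void)py;
  }
}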
@@ -221,7 +221,6 @@ SIMD_INLINE v128 constrain16(v128 a, v128 b, unsigned int threshold,
return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);
}
#if CONFIG_CDEF_SINGLEPASS
// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
unsigned int adjdamp) {
@@ -1081,153 +1080,6 @@ void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride,
}
}
#else
void SIMD_FUNC(cdef_direction_4x4)(uint16_t *y, int ystride, const uint16_t *in,
int threshold, int dir, int damping) {
int i;
v128 p0, p1, sum, row, res;
int o1 = cdef_directions[dir][0];
int o2 = cdef_directions[dir][1];
if (threshold) damping -= get_msb(threshold);
for (i = 0; i < 4; i += 2) {
sum = v128_zero();
row = v128_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]),
v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
// p0 = constrain16(in[i*CDEF_BSTRIDE + offset], row, threshold, damping)
p0 = v128_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + o1]),
v64_load_unaligned(&