Commit ab6c9c77 authored by Steinar Midtskogen

Enable CDEF for chroma for 4:2:2 and 4:4:0

This enables CDEF for chroma when the horizontal and vertical
subsampling differ.  Since the direction search is only performed in
luma, the following corrections to the direction index are applied for
chroma to compensate for the distortion resulting from the subsampling:

4:2:2:   4:4:0:
0 -> 7   0 -> 1
1 -> 0   1 -> 2
2 -> 2   2 -> 2
3 -> 4   3 -> 2
4 -> 5   4 -> 3
5 -> 6   5 -> 4
6 -> 6   6 -> 6
7 -> 6   7 -> 0

This improves the chroma PSNR BDR by about 2% for 4:2:2 content at
cpu-used=4, low delay configuration.  4:2:0 and 4:4:4 content is
unchanged.

Change-Id: Iee92d3697bc5a6fc9b1f340a63243a334935b433
parent 3f53da76
......@@ -168,8 +168,6 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int ydec[3];
int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
int nplanes = av1_num_planes(cm);
int chroma_cdef = xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
av1_setup_dst_planes(xd->plane, cm->sb_size, frame, 0, 0);
......@@ -182,7 +180,6 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
ydec[pli] = xd->plane[pli].subsampling_y;
mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
if (xdec[pli] != ydec[pli]) nplanes = 1;
}
const int stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
for (int pli = 0; pli < nplanes; pli++) {
......@@ -295,10 +292,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int vsize = nvb << mi_high_l2[pli];
if (pli) {
if (chroma_cdef)
level = uv_level;
else
level = 0;
level = uv_level;
sec_strength = uv_sec_strength;
}
......
......@@ -149,8 +149,8 @@ void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
const int s = CDEF_BSTRIDE;
const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
for (i = 0; i < 4 << (bsize == BLOCK_8X8); i++) {
for (j = 0; j < 4 << (bsize == BLOCK_8X8); j++) {
for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) {
for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) {
int16_t sum = 0;
int16_t y;
int16_t x = in[i * s + j];
......@@ -455,9 +455,18 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
if (dirinit) *dirinit = 1;
}
}
// Only run dering for non-zero threshold (which is always the case for
// 4:2:2 or 4:4:0). If we don't dering, we still need to eventually write
// something out in y[] later.
if (pli == 1 && xdec != ydec) {
for (bi = 0; bi < cdef_count; bi++) {
static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 };
static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 };
by = dlist[bi].by;
bx = dlist[bi].bx;
dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
}
}
// Only run dering for non-zero threshold. If we don't dering, we
// still need to eventually write something out in y[] later.
if (threshold != 0) {
assert(bsize == BLOCK_8X8 || bsize == BLOCK_4X4);
for (bi = 0; bi < cdef_count; bi++) {
......@@ -545,8 +554,16 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
if (dirinit) *dirinit = 1;
}
}
if (pli == 1 && xdec != ydec) {
for (bi = 0; bi < cdef_count; bi++) {
static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 };
static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 };
by = dlist[bi].by;
bx = dlist[bi].bx;
dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
}
}
assert(bsize == BLOCK_8X8 || bsize == BLOCK_4X4);
for (bi = 0; bi < cdef_count; bi++) {
int t = dlist[bi].skip ? 0 : pri_strength;
int s = dlist[bi].skip ? 0 : sec_strength;
......
......@@ -1029,16 +1029,56 @@ void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride,
int sec_strength, int dir, int pri_damping,
int sec_damping, int bsize, int max,
int coeff_shift) {
if (dst8)
(bsize == BLOCK_8X8 ? SIMD_FUNC(cdef_filter_block_8x8_8)
: SIMD_FUNC(cdef_filter_block_4x4_8))(
dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
else
(bsize == BLOCK_8X8 ? SIMD_FUNC(cdef_filter_block_8x8_16)
: SIMD_FUNC(cdef_filter_block_4x4_16))(
dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
if (dst8) {
if (bsize == BLOCK_8X8) {
SIMD_FUNC(cdef_filter_block_8x8_8)
(dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
} else if (bsize == BLOCK_4X8) {
SIMD_FUNC(cdef_filter_block_4x4_8)
(dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
SIMD_FUNC(cdef_filter_block_4x4_8)
(dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
sec_strength, dir, pri_damping, sec_damping, max, coeff_shift);
} else if (bsize == BLOCK_8X4) {
SIMD_FUNC(cdef_filter_block_4x4_8)
(dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
SIMD_FUNC(cdef_filter_block_4x4_8)
(dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
} else {
SIMD_FUNC(cdef_filter_block_4x4_8)
(dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
}
} else {
if (bsize == BLOCK_8X8) {
SIMD_FUNC(cdef_filter_block_8x8_16)
(dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
} else if (bsize == BLOCK_4X8) {
SIMD_FUNC(cdef_filter_block_4x4_16)
(dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
SIMD_FUNC(cdef_filter_block_4x4_16)
(dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
sec_strength, dir, pri_damping, sec_damping, max, coeff_shift);
} else if (bsize == BLOCK_8X4) {
SIMD_FUNC(cdef_filter_block_4x4_16)
(dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
SIMD_FUNC(cdef_filter_block_4x4_16)
(dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
} else {
assert(bsize == BLOCK_4X4);
SIMD_FUNC(cdef_filter_block_4x4_16)
(dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
sec_damping, max, coeff_shift);
}
}
}
#else
......
......@@ -1056,10 +1056,9 @@ static void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
cm->nb_cdef_strengths = 1 << cm->cdef_bits;
for (int i = 0; i < cm->nb_cdef_strengths; i++) {
cm->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
cm->cdef_uv_strengths[i] =
cm->subsampling_x == cm->subsampling_y && av1_num_planes(cm) > 1
? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS)
: 0;
cm->cdef_uv_strengths[i] = av1_num_planes(cm) > 1
? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS)
: 0;
}
}
......
......@@ -2520,7 +2520,7 @@ static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
aom_wb_write_literal(wb, cm->cdef_bits, 2);
for (i = 0; i < cm->nb_cdef_strengths; i++) {
aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS);
if (cm->subsampling_x == cm->subsampling_y && av1_num_planes(cm) > 1)
if (av1_num_planes(cm) > 1)
aom_wb_write_literal(wb, cm->cdef_uv_strengths[i], CDEF_STRENGTH_BITS);
}
}
......
......@@ -330,8 +330,6 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
uint16_t *in;
DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
int chroma_cdef = xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
xd->plane[2].subsampling_x == xd->plane[2].subsampling_y;
quantizer =
av1_ac_quant_Q3(cm->base_qindex, 0, cm->bit_depth) >> (cm->bit_depth - 8);
lambda = .12 * quantizer * quantizer / 256.;
......@@ -440,7 +438,6 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
int sec_strength;
threshold = gi / CDEF_SEC_STRENGTHS;
if (fast) threshold = priconv[threshold];
if (pli > 0 && !chroma_cdef) threshold = 0;
/* We avoid filtering the pixels for which some of the pixels to
average
are outside the frame. We could change the filter instead, but it
......@@ -452,40 +449,35 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
int xsize = (nhb << mi_wide_l2[pli]) +
CDEF_HBORDER * (fbc != nhfb - 1) + xoff;
sec_strength = gi % CDEF_SEC_STRENGTHS;
if (pli && !chroma_cdef) {
curr_mse = 0;
} else {
#if CONFIG_CDEF_SINGLEPASS
copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
src[pli],
(fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
(fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
stride[pli], ysize, xsize);
cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli],
dir, &dirinit, var, pli, dlist, cdef_count, threshold,
sec_strength + (sec_strength == 3), pri_damping,
sec_damping, coeff_shift);
#else
if (sec_strength == 0)
copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
src[pli],
(fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
(fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
stride[pli], ysize, xsize);
cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli],
ydec[pli], dir, &dirinit, var, pli, dlist,
cdef_count, threshold,
sec_strength + (sec_strength == 3), pri_damping,
sec_damping, coeff_shift);
#else
if (sec_strength == 0)
copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
src[pli],
(fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
(fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
stride[pli], ysize, xsize);
cdef_filter_fb(sec_strength ? NULL : (uint8_t *)in, CDEF_BSTRIDE,
tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit,
var, pli, dlist, cdef_count, threshold,
sec_strength + (sec_strength == 3), sec_damping,
pri_damping, coeff_shift, sec_strength != 0, 1);
cdef_filter_fb(sec_strength ? NULL : (uint8_t *)in, CDEF_BSTRIDE,
tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var,
pli, dlist, cdef_count, threshold,
sec_strength + (sec_strength == 3), sec_damping,
pri_damping, coeff_shift, sec_strength != 0, 1);
#endif
curr_mse = compute_cdef_dist(
ref_coeff[pli] +
(fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
(fbc * MI_SIZE_64X64 << mi_wide_l2[pli]),
stride[pli], tmp_dst, dlist, cdef_count, bsize[pli],
coeff_shift, pli);
}
curr_mse = compute_cdef_dist(
ref_coeff[pli] +
(fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
(fbc * MI_SIZE_64X64 << mi_wide_l2[pli]),
stride[pli], tmp_dst, dlist, cdef_count, bsize[pli], coeff_shift,
pli);
if (pli < 2)
mse[pli][sb_count][gi] = curr_mse;
else
......
......@@ -306,6 +306,8 @@ INSTANTIATE_TEST_CASE_P(
SSE2, CDEFBlockTest,
::testing::Values(
make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X4),
make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X4),
make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X8),
make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X8)));
INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirTest,
::testing::Values(make_tuple(&cdef_find_dir_sse2,
......@@ -315,6 +317,8 @@ INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirTest,
// SSSE3 instantiation of CDEFBlockTest: one case per CDEF block size
// (4x4, 8x4, 4x8, 8x8), each pairing cdef_filter_block_ssse3 with
// cdef_filter_block_c. The third entry must be BLOCK_4X8; listing
// BLOCK_4X4 twice would leave the 4x8 path untested.
INSTANTIATE_TEST_CASE_P(
    SSSE3, CDEFBlockTest,
    ::testing::Values(
        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X4),
        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X4),
        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X8),
        make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X8)));
INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirTest,
......@@ -339,6 +343,8 @@ INSTANTIATE_TEST_CASE_P(
AVX2, CDEFBlockTest,
::testing::Values(
make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X4),
make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X4),
make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X8),
make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X8)));
INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirTest,
::testing::Values(make_tuple(&cdef_find_dir_avx2,
......@@ -362,6 +368,8 @@ INSTANTIATE_TEST_CASE_P(
SSE2, CDEFSpeedTest,
::testing::Values(
make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X4),
make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X4),
make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_4X8),
make_tuple(&cdef_filter_block_sse2, &cdef_filter_block_c, BLOCK_8X8)));
INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirSpeedTest,
::testing::Values(make_tuple(&cdef_find_dir_sse2,
......@@ -373,6 +381,8 @@ INSTANTIATE_TEST_CASE_P(
SSSE3, CDEFSpeedTest,
::testing::Values(
make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X4),
make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X4),
make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_4X8),
make_tuple(&cdef_filter_block_ssse3, &cdef_filter_block_c, BLOCK_8X8)));
INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirSpeedTest,
::testing::Values(make_tuple(&cdef_find_dir_ssse3,
......@@ -382,10 +392,12 @@ INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirSpeedTest,
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(
SSE4_1, CDEFSpeedTest,
::testing::Values(make_tuple(&cdef_filter_block_sse4_1,
&cdef_filter_block_c, BLOCK_4X4),
make_tuple(&cdef_filter_block_sse4_1,
&cdef_filter_block_c, BLOCK_8X8)));
::testing::Values(
make_tuple(&cdef_filter_block_sse4_1, &cdef_filter_block_c, BLOCK_4X4),
make_tuple(&cdef_filter_block_sse4_1, &cdef_filter_block_c, BLOCK_8X4),
make_tuple(&cdef_filter_block_sse4_1, &cdef_filter_block_c, BLOCK_4X8),
make_tuple(&cdef_filter_block_sse4_1, &cdef_filter_block_c,
BLOCK_8X8)));
INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirSpeedTest,
::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
&cdef_find_dir_c)));
......@@ -396,6 +408,8 @@ INSTANTIATE_TEST_CASE_P(
AVX2, CDEFSpeedTest,
::testing::Values(
make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X4),
make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X4),
make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_4X8),
make_tuple(&cdef_filter_block_avx2, &cdef_filter_block_c, BLOCK_8X8)));
INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirSpeedTest,
::testing::Values(make_tuple(&cdef_find_dir_avx2,
......@@ -407,6 +421,8 @@ INSTANTIATE_TEST_CASE_P(
NEON, CDEFSpeedTest,
::testing::Values(
make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_4X4),
make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_8X4),
make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_4X8),
make_tuple(&cdef_filter_block_neon, &cdef_filter_block_c, BLOCK_8X8)));
INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirSpeedTest,
::testing::Values(make_tuple(&cdef_find_dir_neon,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment