Commit 4ce85214 authored by Ola Hugosson's avatar Ola Hugosson Committed by Ola Hugosson
Browse files

Add deblock_13tap experiment

This change enables using 13 taps for luma plane deblocking and 5 taps for
chroma plane deblocking when pixels are in flat area.

The aim for the experiment is to make sure that luma line 57 and chroma
line 29 of the current superblock is not changed by the deblocking process
of the superblock below. Previously this was already the case for luma
line 56 and chroma line 28 (but not for 57 and 29).

This experiment is part of an effort to reduce the overall line buffer
size for DEBLOCK+CDEF+LR. With this change it is possible to CDEF line
-8 to +55 direcly on the output of deblock (which require line +56 and
+57 to be final).

Change-Id: I7779a08d6ad5683bf35c3372b1526786eaac8472
parent df48d293
......@@ -23,6 +23,14 @@ static INLINE int8_t signed_char_clamp(int t) {
#define PARALLEL_DEBLOCKING_11_TAP 0
#define PARALLEL_DEBLOCKING_9_TAP 0
#if CONFIG_DEBLOCK_13TAP
#define PARALLEL_DEBLOCKING_13_TAP 1
#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 1
#else
#define PARALLEL_DEBLOCKING_13_TAP 0
#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 0
#endif
#if CONFIG_HIGHBITDEPTH
static INLINE int16_t signed_char_clamp_high(int t, int bd) {
switch (bd) {
......@@ -58,6 +66,19 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
return ~mask;
}
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
uint8_t p0, uint8_t q0, uint8_t q1,
uint8_t q2) {
int8_t mask = 0;
mask |= (abs(p1 - p0) > thresh) * -1;
mask |= (abs(q1 - q0) > thresh) * -1;
mask |= (abs(p2 - p0) > thresh) * -1;
mask |= (abs(q2 - q0) > thresh) * -1;
return ~mask;
}
#endif
static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
uint8_t q2, uint8_t q3) {
......@@ -216,6 +237,25 @@ void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
uint8_t *op2, uint8_t *op1, uint8_t *op0,
uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
if (flat && mask) {
const uint8_t p2 = *op2, p1 = *op1, p0 = *op0;
const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
// 5-tap filter [1, 2, 2, 2, 1]
*op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
*op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
*oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
*oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
} else {
filter4(mask, thresh, op1, op0, oq0, oq1);
}
}
#endif
static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
uint8_t *op3, uint8_t *op2, uint8_t *op1,
uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
......@@ -236,6 +276,32 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
}
}
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < count; ++i) {
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask =
filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
s + 2 * p);
++s;
}
}
#endif
void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
int i;
......@@ -268,6 +334,28 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
}
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
for (i = 0; i < count; ++i) {
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask =
filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2);
s += pitch;
}
}
#endif
void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
int i;
......@@ -297,6 +385,56 @@ void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}
#if PARALLEL_DEBLOCKING_13_TAP
static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
int8_t flat2, uint8_t *op6, uint8_t *op5,
uint8_t *op4, uint8_t *op3, uint8_t *op2,
uint8_t *op1, uint8_t *op0, uint8_t *oq0,
uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) {
if (flat2 && flat && mask) {
const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2,
p1 = *op1, p0 = *op0;
const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
q5 = *oq5, q6 = *oq6;
// 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
*op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
4);
*op4 = ROUND_POWER_OF_TWO(
p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
*op3 = ROUND_POWER_OF_TWO(
p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
*op2 = ROUND_POWER_OF_TWO(
p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
4);
*op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
q0 + q1 + q2 + q3 + q4,
4);
*op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
q0 * 2 + q1 + q2 + q3 + q4 + q5,
4);
*oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
q1 * 2 + q2 + q3 + q4 + q5 + q6,
4);
*oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
q2 * 2 + q3 + q4 + q5 + q6 * 2,
4);
*oq2 = ROUND_POWER_OF_TWO(
p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
4);
*oq3 = ROUND_POWER_OF_TWO(
p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
*oq4 = ROUND_POWER_OF_TWO(
p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
*oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
4);
} else {
filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
}
}
#endif
#if PARALLEL_DEBLOCKING_11_TAP
static INLINE void filter12(int8_t mask, uint8_t thresh, int8_t flat,
int8_t flat2, uint8_t *op5, uint8_t *op4,
......@@ -428,7 +566,16 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
#if PARALLEL_DEBLOCKING_11_TAP
#if PARALLEL_DEBLOCKING_13_TAP
(void)p7;
(void)q7;
const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p);
#elif PARALLEL_DEBLOCKING_11_TAP
const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);
filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p,
......@@ -482,7 +629,14 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
#if PARALLEL_DEBLOCKING_11_TAP
#if PARALLEL_DEBLOCKING_13_TAP
(void)p7;
(void)q7;
const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3,
s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6);
#elif PARALLEL_DEBLOCKING_11_TAP
const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5);
filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2,
......@@ -553,6 +707,21 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
return ~mask;
}
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
uint16_t p1, uint16_t p0,
uint16_t q0, uint16_t q1,
uint16_t q2, int bd) {
int8_t mask = 0;
int16_t thresh16 = (uint16_t)thresh << (bd - 8);
mask |= (abs(p1 - p0) > thresh16) * -1;
mask |= (abs(q1 - q0) > thresh16) * -1;
mask |= (abs(p2 - p0) > thresh16) * -1;
mask |= (abs(q2 - q0) > thresh16) * -1;
return ~mask;
}
#endif
static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
uint16_t p1, uint16_t p0, uint16_t q0,
uint16_t q1, uint16_t q2, uint16_t q3,
......@@ -708,6 +877,26 @@ void aom_highbd_lpf_vertical_4_dual_c(
bd);
}
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
uint16_t *op2, uint16_t *op1, uint16_t *op0,
uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
int bd) {
if (flat && mask) {
const uint16_t p2 = *op2, p1 = *op1, p0 = *op0;
const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
// 5-tap filter [1, 2, 2, 2, 1]
*op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
*op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
*oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
*oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
} else {
highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
}
}
#endif
static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
uint16_t *op3, uint16_t *op2, uint16_t *op1,
uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
......@@ -754,6 +943,33 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
}
}
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int bd) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < count; ++i) {
const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask =
highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s,
s + 1 * p, s + 2 * p, bd);
++s;
}
}
#endif
void aom_highbd_lpf_horizontal_8_dual_c(
uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
......@@ -762,6 +978,30 @@ void aom_highbd_lpf_horizontal_8_dual_c(
aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
}
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int bd) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
for (i = 0; i < count; ++i) {
const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask =
highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2,
bd);
s += pitch;
}
}
#endif
void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int bd) {
......@@ -794,6 +1034,68 @@ void aom_highbd_lpf_vertical_8_dual_c(
bd);
}
#if PARALLEL_DEBLOCKING_13_TAP
static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
int8_t flat2, uint16_t *op6, uint16_t *op5,
uint16_t *op4, uint16_t *op3, uint16_t *op2,
uint16_t *op1, uint16_t *op0, uint16_t *oq0,
uint16_t *oq1, uint16_t *oq2, uint16_t *oq3,
uint16_t *oq4, uint16_t *oq5, uint16_t *oq6,
int bd) {
if (flat2 && flat && mask) {
const uint16_t p6 = *op6;
const uint16_t p5 = *op5;
const uint16_t p4 = *op4;
const uint16_t p3 = *op3;
const uint16_t p2 = *op2;
const uint16_t p1 = *op1;
const uint16_t p0 = *op0;
const uint16_t q0 = *oq0;
const uint16_t q1 = *oq1;
const uint16_t q2 = *oq2;
const uint16_t q3 = *oq3;
const uint16_t q4 = *oq4;
const uint16_t q5 = *oq5;
const uint16_t q6 = *oq6;
// 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
*op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
4);
*op4 = ROUND_POWER_OF_TWO(
p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
*op3 = ROUND_POWER_OF_TWO(
p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
*op2 = ROUND_POWER_OF_TWO(
p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
4);
*op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
q0 + q1 + q2 + q3 + q4,
4);
*op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
q0 * 2 + q1 + q2 + q3 + q4 + q5,
4);
*oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
q1 * 2 + q2 + q3 + q4 + q5 + q6,
4);
*oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
q2 * 2 + q3 + q4 + q5 + q6 * 2,
4);
*oq2 = ROUND_POWER_OF_TWO(
p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
4);
*oq3 = ROUND_POWER_OF_TWO(
p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
*oq4 = ROUND_POWER_OF_TWO(
p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
*oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
4);
} else {
highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
bd);
}
}
#endif
static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat,
int8_t flat2, uint16_t *op7, uint16_t *op6,
uint16_t *op5, uint16_t *op4, uint16_t *op3,
......@@ -887,6 +1189,16 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat =
highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
#if PARALLEL_DEBLOCKING_13_TAP
const int8_t flat2 =
highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p],
s[5 * p], s[6 * p], bd);
highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd);
#else
const int8_t flat2 =
highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
......@@ -895,6 +1207,7 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
s + 6 * p, s + 7 * p, bd);
#endif
++s;
}
}
......@@ -937,12 +1250,21 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat =
highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
#if PARALLEL_DEBLOCKING_13_TAP
const int8_t flat2 =
highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd);
highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
s + 6, bd);
#else
const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
q0, s[4], s[5], s[6], s[7], bd);
highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
s + 5, s + 6, s + 7, bd);
#endif
s += p;
}
}
......
......@@ -277,6 +277,29 @@ static int pick_min_grad_direct(uint8_t *const src, int length, int row,
#define PARALLEL_DEBLOCKING_15TAPLUMAONLY 1
#define PARALLEL_DEBLOCKING_DISABLE_15TAP 0
#if CONFIG_DEBLOCK_13TAP
#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 1
#else
#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 0
#endif
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
extern void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh);
extern void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh);
extern void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd);
extern void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd);
#endif
// 64 bit masks for left transform size. Each 1 represents a position where
// we should apply a loop filter across the left border of an 8x8 block
......@@ -2958,7 +2981,11 @@ static void set_lpf_parameters(
#if PARALLEL_DEBLOCKING_15TAPLUMAONLY
// No wide filtering for chroma plane
if (plane != 0) {
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
params->filter_length = 6;
#else
params->filter_length = 8;
#endif
}
#endif
}
......@@ -3144,6 +3171,20 @@ static void av1_filter_block_plane_vert(
aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
case 6: // apply 6-tap filter for chroma plane only
assert(plane != 0);
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_vertical_6_c(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
#endif // CONFIG_HIGHBITDEPTH
aom_lpf_vertical_6_c(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
#endif
// apply 8-tap filtering
case 8:
#if CONFIG_HIGHBITDEPTH
......@@ -3160,13 +3201,25 @@ static void av1_filter_block_plane_vert(
case 16:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
#if CONFIG_DEBLOCK_13TAP
// TODO(olah): Remove _c once SIMD for 13-tap is available
aom_highbd_lpf_vertical_16_c(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
#else
aom_highbd_lpf_vertical_16(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
cm->bit_depth);
#endif
else
#endif // CONFIG_HIGHBITDEPTH
aom_lpf_vertical_16(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
#if CONFIG_DEBLOCK_13TAP
aom_lpf_vertical_16_c(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
#else
aom_lpf_vertical_16(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
#endif
break;
// no filtering
default: break;
......@@ -3341,6 +3394,20 @@ static void av1_filter_block_plane_horz(
aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
#if PARALLEL_DEBLOCKING_5_TAP_CHROMA
// apply 6-tap filtering
case 6: assert(plane != 0);
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_horizontal_6_c(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
#endif // CONFIG_HIGHBITDEPTH
aom_lpf_horizontal_6_c(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
break;
#endif
// apply 8-tap filtering
case 8:
#if CONFIG_HIGHBITDEPTH
......@@ -3357,13 +3424,25 @@ static void av1_filter_block_plane_horz(
case 16:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
#if CONFIG_DEBLOCK_13TAP
// TODO(olah): Remove _c once SIMD for 13-tap is available
aom_highbd_lpf_horizontal_edge_16_c(
CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
#else
aom_highbd_lpf_horizontal_edge_16(
CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
#endif
else
#endif // CONFIG_HIGHBITDEPTH
aom_lpf_horizontal_edge_16(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
#if CONFIG_DEBLOCK_13TAP
aom_lpf_horizontal_edge_16_c(p, dst_stride, params.mblim,
params.lim, params.hev_thr);
#else
aom_lpf_horizontal_edge_16(p, dst_stride, params.mblim, params.lim,
params.hev_thr);
#endif
break;
// no filtering
default: break;
......
......@@ -128,6 +128,7 @@ set(CONFIG_DAALA_DCT64 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DAALA_DCT8 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DAALA_TX 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DCT_ONLY 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DEBLOCK_13TAP 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DELTA_Q 1 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DEPENDENT_HORZTILES 0 CACHE NUMBER "AV1 experiment flag.")
set(CONFIG_DIST_8X8 1 CACHE NUMBER "AV1 experiment flag.")
......