Commit 17905edf authored by Ryan Lei's avatar Ryan Lei

integrate parallel_deblocking with CB4x4

This change makes the parallel deblocking experiment work with
CB4X4. The inner loop processes every 4x4 block.

Change-Id: I86adb3d7b6d67a91ccc12aab29da9bfb8c522cf1
parent b2a01db8
......@@ -149,10 +149,15 @@ void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
for (i = 0; i < count; ++i) {
#if !CONFIG_PARALLEL_DEBLOCKING
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
......@@ -179,10 +184,15 @@ void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
for (i = 0; i < count; ++i) {
#if !CONFIG_PARALLEL_DEBLOCKING
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
......@@ -229,10 +239,15 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
for (i = 0; i < count; ++i) {
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
......@@ -256,8 +271,13 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
for (i = 0; i < 8; ++i) {
for (i = 0; i < count; ++i) {
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask =
......@@ -390,10 +410,15 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int count) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int step = 4;
#else
int step = 8;
#endif
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8 * count; ++i) {
for (i = 0; i < step * count; ++i) {
const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p],
p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p],
p1 = s[-2 * p], p0 = s[-p];
......@@ -436,7 +461,11 @@ void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,
// Thin wrapper around mb_lpf_horizontal_edge_w. All arguments are passed
// through unchanged; only the trailing count is chosen here, at compile time:
// 1 when both CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are enabled,
// 2 otherwise.
// NOTE(review): the helper appears to multiply count by its own per-config
// step, so the two configurations cover different spans -- confirm against
// the helper's loop bound.
void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh) {
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
  const int edge_count = 1;
#else
  const int edge_count = 2;
#endif
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, edge_count);
}
static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
......@@ -478,7 +507,11 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
// Thin wrapper around mb_lpf_vertical_edge_w. All arguments are passed
// through unchanged; only the trailing argument is chosen here, at compile
// time: 4 when both CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are enabled,
// 8 otherwise.
void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
  const int edge_count = 4;
#else
  const int edge_count = 8;
#endif
  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, edge_count);
}
void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
......@@ -596,10 +629,15 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh, int bd) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
for (i = 0; i < count; ++i) {
#if !CONFIG_PARALLEL_DEBLOCKING
const uint16_t p3 = s[-4 * p];
const uint16_t p2 = s[-3 * p];
......@@ -636,10 +674,15 @@ void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int bd) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
for (i = 0; i < count; ++i) {
#if !CONFIG_PARALLEL_DEBLOCKING
const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
......@@ -689,10 +732,15 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int bd) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
for (i = 0; i < count; ++i) {
const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
......@@ -718,8 +766,13 @@ void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int bd) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int count = 4;
#else
int count = 8;
#endif
for (i = 0; i < 8; ++i) {
for (i = 0; i < count; ++i) {
const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask =
......@@ -813,10 +866,15 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
const uint8_t *thresh, int count,
int bd) {
int i;
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
int step = 4;
#else
int step = 8;
#endif
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8 * count; ++i) {
for (i = 0; i < step * count; ++i) {
const uint16_t p3 = s[-4 * p];
const uint16_t p2 = s[-3 * p];
const uint16_t p1 = s[-2 * p];
......@@ -852,7 +910,11 @@ void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
#else
highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
#endif
}
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
......@@ -888,13 +950,21 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
// High-bit-depth wrapper around highbd_mb_lpf_vertical_edge_w. The sample
// pointer, pitch, threshold pointers and bit depth are passed through
// unchanged; the argument preceding bd is chosen at compile time:
// 4 when both CONFIG_PARALLEL_DEBLOCKING and CONFIG_CB4X4 are enabled,
// 8 otherwise.
void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
  const int edge_count = 4;
#else
  const int edge_count = 8;
#endif
  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, edge_count, bd);
}
// "Dual" high-bit-depth wrapper around highbd_mb_lpf_vertical_edge_w. It
// passes twice the value used by aom_highbd_lpf_vertical_16_c for the
// argument preceding bd: 8 when both CONFIG_PARALLEL_DEBLOCKING and
// CONFIG_CB4X4 are enabled, 16 otherwise. Every other argument is passed
// through unchanged.
void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh, int bd) {
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
  const int edge_count = 8;
#else
  const int edge_count = 16;
#endif
  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, edge_count, bd);
}
#endif // CONFIG_HIGHBITDEPTH
This diff is collapsed.
......@@ -551,6 +551,11 @@ post_process_cmdline() {
soft_enable accounting
soft_enable inspection
fi
if enabled parallel_deblocking_15tap && ! enabled parallel_deblocking; then
log_echo "parallel_deblocking_15tap dependes on parallel_deblocking, so"
log_echo "enabling parallel_deblocking"
soft_enable parallel_deblocking
fi
}
process_targets() {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment