Commit a2bbf621 authored by Yaowu Xu's avatar Yaowu Xu Committed by Gerrit Code Review
Browse files

Merge "Reduce memory footprint for CLPF decoding." into nextgenv2

parents 4da3ed40 e8224c7a
......@@ -587,7 +587,7 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint
specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
if (aom_config("CONFIG_CLPF") eq "yes") {
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int stride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength";
specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
......
......@@ -27,30 +27,30 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) {
return (8 + delta - (delta < 0)) >> 4;
}
void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int stride, int x0,
int y0, int sizex, int sizey, int width, int height,
unsigned int strength) {
void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
int width, int height, unsigned int strength) {
int x, y;
for (y = y0; y < y0 + sizey; y++) {
for (x = x0; x < x0 + sizex; x++) {
int X = src[y * stride + x];
int A = src[AOMMAX(0, y - 1) * stride + x];
int B = src[y * stride + AOMMAX(0, x - 2)];
int C = src[y * stride + AOMMAX(0, x - 1)];
int D = src[y * stride + AOMMIN(width - 1, x + 1)];
int E = src[y * stride + AOMMIN(width - 1, x + 2)];
int F = src[AOMMIN(height - 1, y + 1) * stride + x];
int X = src[y * sstride + x];
int A = src[AOMMAX(0, y - 1) * sstride + x];
int B = src[y * sstride + AOMMAX(0, x - 2)];
int C = src[y * sstride + AOMMAX(0, x - 1)];
int D = src[y * sstride + AOMMIN(width - 1, x + 1)];
int E = src[y * sstride + AOMMIN(width - 1, x + 2)];
int F = src[AOMMIN(height - 1, y + 1) * sstride + x];
int delta;
delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
dst[y * stride + x] = X + delta;
dst[y * dstride + x] = X + delta;
}
}
}
// Return number of filtered blocks
int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst,
const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org,
AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *,
......@@ -59,23 +59,45 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
/* Constrained low-pass filter (CLPF) */
int c, k, l, m, n;
const int bs = MI_SIZE;
int width = rec->y_crop_width;
int height = rec->y_crop_height;
const int width = rec->y_crop_width;
const int height = rec->y_crop_height;
int xpos, ypos;
int stride_y = rec->y_stride;
int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
const int sstride = rec->y_stride;
int dstride = orig_dst->y_stride;
const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
int block_index = 0;
uint8_t *cache = NULL;
uint8_t **cache_ptr = NULL;
uint8_t **cache_dst = NULL;
int cache_idx = 0;
const int cache_size = num_fb_hor << (2 * fb_size_log2);
const int cache_blocks = cache_size / (bs * bs);
YV12_BUFFER_CONFIG dst = *orig_dst;
// Make buffer space for in-place filtering
if (rec->y_buffer == dst.y_buffer) {
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
CHECK_MEM_ERROR(cm, cache_ptr,
aom_malloc(cache_blocks * sizeof(*cache_ptr)));
CHECK_MEM_ERROR(cm, cache_dst,
aom_malloc(cache_blocks * sizeof(*cache_dst)));
memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst));
dst.y_buffer = cache;
dstride = bs;
}
// Iterate over all filter blocks
for (k = 0; k < num_fb_ver; k++) {
for (l = 0; l < num_fb_hor; l++) {
int h, w;
int allskip = 1;
const int xoff = l << fb_size_log2;
const int yoff = k << fb_size_log2;
for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) {
xpos = (l << fb_size_log2) + n * bs;
ypos = (k << fb_size_log2) + m * bs;
xpos = xoff + n * bs;
ypos = yoff + m * bs;
if (xpos < width && ypos < height) {
allskip &=
cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
......@@ -96,31 +118,57 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
// Iterate over all smaller blocks inside the filter block
for (m = 0; m < (h + bs - 1) / bs; m++) {
for (n = 0; n < (w + bs - 1) / bs; n++) {
xpos = (l << fb_size_log2) + n * bs;
ypos = (k << fb_size_log2) + m * bs;
xpos = xoff + n * bs;
ypos = yoff + m * bs;
if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
->mbmi.skip) {
// Not skip block, apply the filter
aom_clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos,
bs, bs, width, height, strength);
->mbmi.skip) { // Not skip block
// Temporary buffering needed if filtering in-place
if (cache) {
if (cache_ptr[cache_idx]) {
// Copy filtered block back into the frame
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
}
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
cache_dst[cache_idx] = rec->y_buffer + ypos * sstride + xpos;
if (++cache_idx >= cache_blocks) cache_idx = 0;
}
// Apply the filter
aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride,
xpos, ypos, bs, bs, width, height, strength);
} else { // Skip block, copy instead
for (c = 0; c < bs; c++)
*(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) =
*(uint64_t *)(rec->y_buffer + (ypos + c) * stride_y + xpos);
if (!cache)
for (c = 0; c < bs; c++)
*(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) = *(
uint64_t *)(rec->y_buffer + (ypos + c) * sstride + xpos);
}
}
}
} else { // Entire filter block is skip, copy
for (m = 0; m < h; m++)
memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y +
(l << fb_size_log2),
rec->y_buffer + ((k << fb_size_log2) + m) * stride_y +
(l << fb_size_log2),
w);
if (!cache)
for (m = 0; m < h; m++)
memcpy(dst.y_buffer + (yoff + m) * dstride + xoff,
rec->y_buffer + (yoff + m) * sstride + xoff, w);
}
block_index += !allskip; // Count number of blocks filtered
}
}
if (cache) {
// Copy remaining blocks into the frame
for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx];
cache_idx++)
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
aom_free(cache);
aom_free(cache_ptr);
}
return block_index;
}
......@@ -18,7 +18,7 @@
int av1_clpf_maxbits(const AV1_COMMON *cm);
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
......
......@@ -11,11 +11,11 @@
#include "./aom_dsp_rtcd.h"
static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
int y0, int sizey, int width, int height,
unsigned int strength) {
dst += x0 + y0 * stride;
src += x0 + y0 * stride;
static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int width,
int height, unsigned int strength) {
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
{
int bottom = height - 2 - y0;
const v128 sp = v128_dup_8(strength);
......@@ -32,23 +32,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + stride);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_shuffle_8(x, b_shuff);
const v128 c = v128_shuffle_8(x, c_shuff);
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + stride)));
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + stride)));
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(l2, v64_load_aligned(
src + ((y != bottom) + 1) * stride)));
src + ((y != bottom) + 1) * sstride)));
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
......@@ -70,9 +70,9 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
delta, v128_zero()))),
4));
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + stride, v128_low_v64(o));
src += stride * 2;
dst += stride * 2;
v64_store_aligned(dst + dstride, v128_low_v64(o));
src += sstride * 2;
dst += dstride * 2;
}
} else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
......@@ -83,23 +83,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + stride);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + stride)));
v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + stride)));
v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_shuffle_8(x, d_shuff);
const v128 e = v128_shuffle_8(x, e_shuff);
const v128 f = v128_add_8(
c128, v128_from_v64(l2, v64_load_aligned(
src + ((y != bottom) + 1) * stride)));
src + ((y != bottom) + 1) * sstride)));
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
......@@ -121,35 +121,35 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
delta, v128_zero()))),
4));
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + stride, v128_low_v64(o));
src += stride * 2;
dst += stride * 2;
v64_store_aligned(dst + dstride, v128_low_v64(o));
src += sstride * 2;
dst += dstride * 2;
}
} else { // No left/right clipping
int y;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + stride);
const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8(
c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1));
v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + stride)));
v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + stride)));
v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + stride)));
v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + stride)));
v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8(
c128, v128_from_v64(l2, v64_load_aligned(
src + ((y != bottom) + 1) * stride)));
src + ((y != bottom) + 1) * sstride)));
const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
......@@ -171,17 +171,18 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
delta, v128_zero()))),
4));
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + stride, v128_low_v64(o));
src += stride * 2;
dst += stride * 2;
v64_store_aligned(dst + dstride, v128_low_v64(o));
src += sstride * 2;
dst += dstride * 2;
}
}
}
}
void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
int x0, int y0, int sizex, int sizey, int width,
int height, unsigned int strength) {
void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex,
int sizey, int width, int height,
unsigned int strength) {
// TODO(stemidts):
// A sizex different from 8 will only be needed if CLPF is extended to chroma.
// This will only be used if 4:2:0 and width not a multiple of 16 and along
......@@ -189,9 +190,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
// this case. If not extended to chroma, this test will be redundant.
if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
// Fallback to C for odd sizes
aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height,
strength);
aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
height, strength);
} else {
clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength);
clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
strength);
}
}
......@@ -3929,19 +3929,10 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
#if CONFIG_CLPF
if (cm->clpf_strength && !cm->skip_loop_filter) {
YV12_BUFFER_CONFIG dst; // Buffer for the result
dst = pbi->cur_buf->buf;
CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * dst.y_height));
av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size,
const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
av1_clpf_frame(frame, frame, 0, cm, !!cm->clpf_size,
cm->clpf_strength + (cm->clpf_strength == 3),
4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);
// Copy result
memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer,
dst.y_height * dst.y_stride);
aom_free(dst.y_buffer);
}
if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
#endif
......
......@@ -26,9 +26,9 @@ using libaom_test::ACMRandom;
namespace {
typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride,
int x0, int y0, int sizex, int sizey, int width,
int height, unsigned int strength);
typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey,
int width, int height, unsigned int strength);
typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
clpf_block_param_t;
......@@ -85,10 +85,10 @@ TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
for (ypos = 0; ypos < size && !error; ypos += h * !error) {
for (xpos = 0; xpos < size && !error; xpos += w * !error) {
for (strength = 0; strength < 3 && !error; strength += !error) {
ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size,
ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size,
1 << strength);
ASM_REGISTER_STATE_CHECK(
clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength));
ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w, h,
size, size, 1 << strength));
for (pos = 0; pos < size * size && !error; pos++) {
error = ref_d[pos] != d[pos];
......@@ -137,7 +137,8 @@ TEST_P(ClpfSpeedTest, TestSpeed) {
for (ypos = 0; ypos < size; ypos += h) {
for (xpos = 0; xpos < size; xpos += w) {
for (strength = 0; strength < 3; strength++) {
ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
ref_clpf(s, d, size, size, xpos, ypos, w, h, size, size,
1 << strength);
}
}
}
......@@ -150,7 +151,7 @@ TEST_P(ClpfSpeedTest, TestSpeed) {
for (ypos = 0; ypos < size; ypos += h) {
for (xpos = 0; xpos < size; xpos += w) {
for (strength = 0; strength < 3; strength++) {
clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength);
clpf(s, d, size, size, xpos, ypos, w, h, size, size, 1 << strength);
}
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment