Commit eb5794da authored by Steinar Midtskogen

Reduce memory footprint for CLPF decoding.

Instead of having CLPF write to an entire new frame and
copy the result back into the original frame, make the
filter able to work in-place by keeping a buffer of size
frame_width*filter_block_size and delaying the write-back
by one filter_block_size row.

This reduces the cycles spent in the filter to ~75% of what they were before.

Change-Id: I78ca74380c45492daa8935d08d766851edb5fbc1
parent 59228957
...@@ -625,7 +625,7 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint ...@@ -625,7 +625,7 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint
specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
if (aom_config("CONFIG_CLPF") eq "yes") { if (aom_config("CONFIG_CLPF") eq "yes") {
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int stride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength";
specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/; specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength"; add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength";
specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/; specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/;
......
...@@ -27,30 +27,30 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) { ...@@ -27,30 +27,30 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) {
return (8 + delta - (delta < 0)) >> 4; return (8 + delta - (delta < 0)) >> 4;
} }
void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int stride, int x0, void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
int y0, int sizex, int sizey, int width, int height, int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength) { int width, int height, unsigned int strength) {
int x, y; int x, y;
for (y = y0; y < y0 + sizey; y++) { for (y = y0; y < y0 + sizey; y++) {
for (x = x0; x < x0 + sizex; x++) { for (x = x0; x < x0 + sizex; x++) {
int X = src[y * stride + x]; int X = src[y * sstride + x];
int A = src[AOMMAX(0, y - 1) * stride + x]; int A = src[AOMMAX(0, y - 1) * sstride + x];
int B = src[y * stride + AOMMAX(0, x - 2)]; int B = src[y * sstride + AOMMAX(0, x - 2)];
int C = src[y * stride + AOMMAX(0, x - 1)]; int C = src[y * sstride + AOMMAX(0, x - 1)];
int D = src[y * stride + AOMMIN(width - 1, x + 1)]; int D = src[y * sstride + AOMMIN(width - 1, x + 1)];
int E = src[y * stride + AOMMIN(width - 1, x + 2)]; int E = src[y * sstride + AOMMIN(width - 1, x + 2)];
int F = src[AOMMIN(height - 1, y + 1) * stride + x]; int F = src[AOMMIN(height - 1, y + 1) * sstride + x];
int delta; int delta;
delta = av1_clpf_sample(X, A, B, C, D, E, F, strength); delta = av1_clpf_sample(X, A, B, C, D, E, F, strength);
dst[y * stride + x] = X + delta; dst[y * dstride + x] = X + delta;
} }
} }
} }
// Return number of filtered blocks // Return number of filtered blocks
int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org,
int enable_fb_flag, unsigned int strength, AV1_COMMON *cm, int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks, unsigned int fb_size_log2, uint8_t *blocks,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *, int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
const YV12_BUFFER_CONFIG *, const YV12_BUFFER_CONFIG *,
...@@ -59,23 +59,45 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, ...@@ -59,23 +59,45 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
/* Constrained low-pass filter (CLPF) */ /* Constrained low-pass filter (CLPF) */
int c, k, l, m, n; int c, k, l, m, n;
const int bs = MI_SIZE; const int bs = MI_SIZE;
int width = rec->y_crop_width; const int width = rec->y_crop_width;
int height = rec->y_crop_height; const int height = rec->y_crop_height;
int xpos, ypos; int xpos, ypos;
int stride_y = rec->y_stride; const int sstride = rec->y_stride;
int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2; int dstride = orig_dst->y_stride;
int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2; const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
int block_index = 0; int block_index = 0;
uint8_t *cache = NULL;
uint8_t **cache_ptr = NULL;
uint8_t **cache_dst = NULL;
int cache_idx = 0;
const int cache_size = num_fb_hor << (2 * fb_size_log2);
const int cache_blocks = cache_size / (bs * bs);
YV12_BUFFER_CONFIG dst = *orig_dst;
// Make buffer space for in-place filtering
if (rec->y_buffer == dst.y_buffer) {
CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size));
CHECK_MEM_ERROR(cm, cache_ptr,
aom_malloc(cache_blocks * sizeof(*cache_ptr)));
CHECK_MEM_ERROR(cm, cache_dst,
aom_malloc(cache_blocks * sizeof(*cache_dst)));
memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst));
dst.y_buffer = cache;
dstride = bs;
}
// Iterate over all filter blocks // Iterate over all filter blocks
for (k = 0; k < num_fb_ver; k++) { for (k = 0; k < num_fb_ver; k++) {
for (l = 0; l < num_fb_hor; l++) { for (l = 0; l < num_fb_hor; l++) {
int h, w; int h, w;
int allskip = 1; int allskip = 1;
const int xoff = l << fb_size_log2;
const int yoff = k << fb_size_log2;
for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) { for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) { for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) {
xpos = (l << fb_size_log2) + n * bs; xpos = xoff + n * bs;
ypos = (k << fb_size_log2) + m * bs; ypos = yoff + m * bs;
if (xpos < width && ypos < height) { if (xpos < width && ypos < height) {
allskip &= allskip &=
cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
...@@ -96,31 +118,57 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, ...@@ -96,31 +118,57 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
// Iterate over all smaller blocks inside the filter block // Iterate over all smaller blocks inside the filter block
for (m = 0; m < (h + bs - 1) / bs; m++) { for (m = 0; m < (h + bs - 1) / bs; m++) {
for (n = 0; n < (w + bs - 1) / bs; n++) { for (n = 0; n < (w + bs - 1) / bs; n++) {
xpos = (l << fb_size_log2) + n * bs; xpos = xoff + n * bs;
ypos = (k << fb_size_log2) + m * bs; ypos = yoff + m * bs;
if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs]
->mbmi.skip) { ->mbmi.skip) { // Not skip block
// Not skip block, apply the filter // Temporary buffering needed if filtering in-place
aom_clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos, if (cache) {
bs, bs, width, height, strength); if (cache_ptr[cache_idx]) {
// Copy filtered block back into the frame
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
}
cache_ptr[cache_idx] = cache + cache_idx * bs * bs;
dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos;
cache_dst[cache_idx] = rec->y_buffer + ypos * sstride + xpos;
if (++cache_idx >= cache_blocks) cache_idx = 0;
}
// Apply the filter
aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride,
xpos, ypos, bs, bs, width, height, strength);
} else { // Skip block, copy instead } else { // Skip block, copy instead
for (c = 0; c < bs; c++) if (!cache)
*(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) = for (c = 0; c < bs; c++)
*(uint64_t *)(rec->y_buffer + (ypos + c) * stride_y + xpos); *(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) = *(
uint64_t *)(rec->y_buffer + (ypos + c) * sstride + xpos);
} }
} }
} }
} else { // Entire filter block is skip, copy } else { // Entire filter block is skip, copy
for (m = 0; m < h; m++) if (!cache)
memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y + for (m = 0; m < h; m++)
(l << fb_size_log2), memcpy(dst.y_buffer + (yoff + m) * dstride + xoff,
rec->y_buffer + ((k << fb_size_log2) + m) * stride_y + rec->y_buffer + (yoff + m) * sstride + xoff, w);
(l << fb_size_log2),
w);
} }
block_index += !allskip; // Count number of blocks filtered block_index += !allskip; // Count number of blocks filtered
} }
} }
if (cache) {
// Copy remaining blocks into the frame
for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx];
cache_idx++)
for (c = 0; c < bs; c++)
*(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
*(uint64_t *)(cache_ptr[cache_idx] + c * bs);
aom_free(cache);
aom_free(cache_ptr);
}
return block_index; return block_index;
} }
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
int av1_clpf_maxbits(const AV1_COMMON *cm); int av1_clpf_maxbits(const AV1_COMMON *cm);
int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b); int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec,
const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
int enable_fb_flag, unsigned int strength, int enable_fb_flag, unsigned int strength,
unsigned int fb_size_log2, uint8_t *blocks, unsigned int fb_size_log2, uint8_t *blocks,
int (*decision)(int, int, const YV12_BUFFER_CONFIG *, int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
......
...@@ -11,11 +11,11 @@ ...@@ -11,11 +11,11 @@
#include "./aom_dsp_rtcd.h" #include "./aom_dsp_rtcd.h"
static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride,
int y0, int sizey, int width, int height, int dstride, int x0, int y0, int sizey, int width,
unsigned int strength) { int height, unsigned int strength) {
dst += x0 + y0 * stride; dst += x0 + y0 * dstride;
src += x0 + y0 * stride; src += x0 + y0 * sstride;
{ {
int bottom = height - 2 - y0; int bottom = height - 2 - y0;
const v128 sp = v128_dup_8(strength); const v128 sp = v128_dup_8(strength);
...@@ -32,23 +32,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, ...@@ -32,23 +32,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
for (y = 0; y < sizey; y += 2) { for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src); const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + stride); const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2); v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o); const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8( const v128 a = v128_add_8(
c128, c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_shuffle_8(x, b_shuff); const v128 b = v128_shuffle_8(x, b_shuff);
const v128 c = v128_shuffle_8(x, c_shuff); const v128 c = v128_shuffle_8(x, c_shuff);
const v128 d = v128_add_8( const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1), c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + stride))); v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8( const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2), c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + stride))); v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8( const v128 f = v128_add_8(
c128, v128_from_v64(l2, v64_load_aligned( c128, v128_from_v64(l2, v64_load_aligned(
src + ((y != bottom) + 1) * stride))); src + ((y != bottom) + 1) * sstride)));
const v128 tmp = const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
...@@ -70,9 +70,9 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, ...@@ -70,9 +70,9 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
delta, v128_zero()))), delta, v128_zero()))),
4)); 4));
v64_store_aligned(dst, v128_high_v64(o)); v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + stride, v128_low_v64(o)); v64_store_aligned(dst + dstride, v128_low_v64(o));
src += stride * 2; src += sstride * 2;
dst += stride * 2; dst += dstride * 2;
} }
} else if (!(width - x0 - 8)) { // Clip right } else if (!(width - x0 - 8)) { // Clip right
const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL),
...@@ -83,23 +83,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, ...@@ -83,23 +83,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
for (y = 0; y < sizey; y += 2) { for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src); const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + stride); const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2); v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o); const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8( const v128 a = v128_add_8(
c128, c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8( const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2), c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + stride))); v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8( const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1), c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + stride))); v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_shuffle_8(x, d_shuff); const v128 d = v128_shuffle_8(x, d_shuff);
const v128 e = v128_shuffle_8(x, e_shuff); const v128 e = v128_shuffle_8(x, e_shuff);
const v128 f = v128_add_8( const v128 f = v128_add_8(
c128, v128_from_v64(l2, v64_load_aligned( c128, v128_from_v64(l2, v64_load_aligned(
src + ((y != bottom) + 1) * stride))); src + ((y != bottom) + 1) * sstride)));
const v128 tmp = const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
...@@ -121,35 +121,35 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, ...@@ -121,35 +121,35 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
delta, v128_zero()))), delta, v128_zero()))),
4)); 4));
v64_store_aligned(dst, v128_high_v64(o)); v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + stride, v128_low_v64(o)); v64_store_aligned(dst + dstride, v128_low_v64(o));
src += stride * 2; src += sstride * 2;
dst += stride * 2; dst += dstride * 2;
} }
} else { // No left/right clipping } else { // No left/right clipping
int y; int y;
for (y = 0; y < sizey; y += 2) { for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src); const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + stride); const v64 l2 = v64_load_aligned(src + sstride);
v128 o = v128_from_v64(l1, l2); v128 o = v128_from_v64(l1, l2);
const v128 x = v128_add_8(c128, o); const v128 x = v128_add_8(c128, o);
const v128 a = v128_add_8( const v128 a = v128_add_8(
c128, c128,
v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1));
const v128 b = v128_add_8( const v128 b = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 2), c128, v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + stride))); v64_load_unaligned(src - 2 + sstride)));
const v128 c = v128_add_8( const v128 c = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src - 1), c128, v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + stride))); v64_load_unaligned(src - 1 + sstride)));
const v128 d = v128_add_8( const v128 d = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 1), c128, v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + stride))); v64_load_unaligned(src + 1 + sstride)));
const v128 e = v128_add_8( const v128 e = v128_add_8(
c128, v128_from_v64(v64_load_unaligned(src + 2), c128, v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + stride))); v64_load_unaligned(src + 2 + sstride)));
const v128 f = v128_add_8( const v128 f = v128_add_8(
c128, v128_from_v64(l2, v64_load_aligned( c128, v128_from_v64(l2, v64_load_aligned(
src + ((y != bottom) + 1) * stride))); src + ((y != bottom) + 1) * sstride)));
const v128 tmp = const v128 tmp =
v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm),
...@@ -171,17 +171,18 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, ...@@ -171,17 +171,18 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0,
delta, v128_zero()))), delta, v128_zero()))),
4)); 4));
v64_store_aligned(dst, v128_high_v64(o)); v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + stride, v128_low_v64(o)); v64_store_aligned(dst + dstride, v128_low_v64(o));
src += stride * 2; src += sstride * 2;
dst += stride * 2; dst += dstride * 2;
} }
} }
} }
} }
void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride, void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
int x0, int y0, int sizex, int sizey, int width, int dstride, int x0, int y0, int sizex,
int height, unsigned int strength) { int sizey, int width, int height,
unsigned int strength) {
// TODO(stemidts): // TODO(stemidts):
// A sizex different from 8 will only be needed if CLPF is extended to chroma. // A sizex different from 8 will only be needed if CLPF is extended to chroma.
// This will only be used if 4:2:0 and width not a multiple of 16 and along // This will only be used if 4:2:0 and width not a multiple of 16 and along
...@@ -189,9 +190,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride, ...@@ -189,9 +190,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride,
// this case. If not extended to chroma, this test will be redundant. // this case. If not extended to chroma, this test will be redundant.
if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) { if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) {
// Fallback to C for odd sizes // Fallback to C for odd sizes
aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height, aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
strength); height, strength);
} else { } else {
clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength); clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height,
strength);
} }
} }
...@@ -2320,19 +2320,10 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data, ...@@ -2320,19 +2320,10 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
#if CONFIG_CLPF #if CONFIG_CLPF
if (cm->clpf_strength && !cm->skip_loop_filter) { if (cm->clpf_strength && !cm->skip_loop_filter) {
YV12_BUFFER_CONFIG dst; // Buffer for the result const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
av1_clpf_frame(frame, frame, 0, cm, !!cm->clpf_size,
dst = pbi->cur_buf->buf;
CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * dst.y_height));
av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size,
cm->clpf_strength + (cm->clpf_strength == 3), cm->clpf_strength + (cm->clpf_strength == 3),
4 + cm->clpf_size, cm->clpf_blocks, clpf_bit); 4 + cm->clpf_size, cm->clpf_blocks, clpf_bit);
// Copy result
memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer,
dst.y_height * dst.y_stride);
aom_free(dst.y_buffer);
} }
if (cm->clpf_blocks) aom_free(cm->clpf_blocks); if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
#endif #endif
......
...@@ -26,9 +26,9 @@ using libaom_test::ACMRandom; ...@@ -26,9 +26,9 @@ using libaom_test::ACMRandom;
namespace { namespace {
typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride, typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride,
int x0, int y0, int sizex, int sizey, int width, int dstride, int x0, int y0, int sizex, int sizey,
int height, unsigned int strength); int width, int height, unsigned int strength);
typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int> typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
clpf_block_param_t; clpf_block_param_t;
...@@ -85,10 +85,10 @@ TEST_P(ClpfBlockTest, TestSIMDNoMismatch) { ...@@ -85,10 +85,10 @@ TEST_P(ClpfBlockTest, TestSIMDNoMismatch) {
for (ypos = 0; ypos < size && !error; ypos += h * !error) { for (ypos = 0; ypos < size && !error; ypos += h * !error) {
for (xpos = 0; xpos < size && !error; xpos += w * !error) { for (xpos = 0; xpos < size && !error; xpos += w * !error) {
for (strength = 0; strength < 3 && !error; strength += !error) { for (strength = 0; strength < 3 && !error; strength += !error) {
ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size, ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size,
1 << strength); 1 << strength);
ASM_REGISTER_STATE_CHECK( ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w, h,
clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength)); size, size, 1 << strength));
for (pos = 0; pos < size * size && !error; pos++) { for (pos = 0; pos < size * size && !error; pos++) {
error = ref_d[pos] != d[pos]; error = ref_d[pos] != d[pos];
...@@ -137,7 +137,8 @@ TEST_P(ClpfSpeedTest, TestSpeed) { ...@@ -137,7 +137,8 @@ TEST_P(ClpfSpeedTest, TestSpeed) {
for (ypos = 0; ypos < size; ypos += h) { for (ypos = 0; ypos < size; ypos += h) {
for (xpos = 0; xpos < size; xpos += w) { for (xpos = 0; xpos < size; xpos += w) {
for (strength = 0; strength < 3; strength++) { for (strength = 0; strength < 3; strength++) {
ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength); ref_clpf(s, d, size, size, xpos, ypos, w, h, size, size,
1 << strength);
} }
} }
} }
...@@ -150,7 +151,7 @@ TEST_P(ClpfSpeedTest, TestSpeed) { ...@@ -150,7 +151,7 @@ TEST_P(ClpfSpeedTest, TestSpeed) {
for (ypos = 0; ypos < size; ypos += h) { for (ypos = 0; ypos < size; ypos += h) {
for (xpos = 0; xpos < size; xpos += w) { for (xpos = 0; xpos < size; xpos += w) {
for (strength = 0; strength < 3; strength++) { for (strength = 0; strength < 3; strength++) {
clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength); clpf(s, d, size, size, xpos, ypos, w, h, size, size, 1 << strength);
} }
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment