Commit be6cc07d authored by David Barker's avatar David Barker Committed by Debargha Mukherjee

Add new convolve variant for loop-restoration

The convolve filters generated by loop_wiener_filter_tile
are not compatible with some existing convolve implementations
(they can have coefficients >128, sums of (certain subsets of)
coefficients >128, etc.)

So we implement a new variant, which takes a filter with 128
subtracted from its central element and which adds an extra copy
of the source just before clipping to a pixel (reinstating the
128 we subtracted). This should be easy to adapt from the existing
convolve functions, and this patch includes SSE2 highbd and
SSSE3 lowbd implementations.

Change-Id: I0abf4c2915f0665c49d88fe450dbc77b783f69e1
parent 469a5c80
This diff is collapsed.
......@@ -734,6 +734,16 @@ specialize qw/aom_convolve8_avg_horiz sse2 ssse3/;
specialize qw/aom_convolve8_avg_vert sse2 ssse3/;
specialize qw/aom_scaled_2d ssse3/;
if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void aom_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/aom_convolve8_add_src ssse3/;
specialize qw/aom_convolve8_add_src_horiz ssse3/;
specialize qw/aom_convolve8_add_src_vert ssse3/;
} # CONFIG_LOOP_RESTORATION
# TODO(any): These need to be extended to up to 128x128 block sizes
if (!(aom_config("CONFIG_AV1") eq "yes" && aom_config("CONFIG_EXT_PARTITION") eq "yes")) {
specialize qw/aom_convolve_copy neon dspr2 msa/;
......@@ -770,6 +780,16 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/aom_highbd_convolve8_avg_vert/, "$sse2_x86_64";
if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void aom_highbd_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
add_proto qw/void aom_highbd_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
add_proto qw/void aom_highbd_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/aom_highbd_convolve8_add_src sse2/;
specialize qw/aom_highbd_convolve8_add_src_horiz sse2/;
specialize qw/aom_highbd_convolve8_add_src_vert sse2/;
} # CONFIG_LOOP_RESTORATION
} # CONFIG_AOM_HIGHBITDEPTH
#
......
......@@ -159,5 +159,24 @@ HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
// int w, int h, int bd);
HIGH_FUN_CONV_2D(, sse2);
HIGH_FUN_CONV_2D(avg_, sse2);
#if CONFIG_LOOP_RESTORATION
// The SSE2 highbd convolve functions can deal with coefficients up to 32767.
// So redirect highbd_convolve8_add_src to regular highbd_convolve8.
void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h, int bd) {
  // The "add_src" filters have had 128 subtracted from their central tap;
  // reinstate it here and defer to the regular highbd convolve, whose SSE2
  // kernels can handle coefficients up to 32767.
  //
  // Work on local copies rather than patching the caller's arrays through a
  // const-cast: writing through a pointer obtained by casting away const is
  // undefined behavior if the underlying object is const (CERT EXP40-C), and
  // the in-place patch is also a data race when two threads convolve with
  // the same kernel concurrently.
  int16_t fx[8], fy[8];  // 8 == SUBPEL_TAPS
  int i;
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  for (i = 0; i < 8; ++i) {
    fx[i] = filter_x[i];
    fy[i] = filter_y[i];
  }
  fx[3] += 128;  // central tap: undo the -128 bias applied by the caller
  fy[3] += 128;
  aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, fx, x_step_q4,
                            fy, y_step_q4, w, h, bd);
}
#endif // CONFIG_LOOP_RESTORATION
#endif // CONFIG_AOM_HIGHBITDEPTH && ARCH_X86_64
#endif // HAVE_SSE2
......@@ -291,6 +291,14 @@ filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3;
#if CONFIG_LOOP_RESTORATION
filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
#endif
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
......@@ -331,6 +339,13 @@ FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
ssse3);
#if CONFIG_LOOP_RESTORATION
FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
ssse3);
FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
src - src_stride * 3, add_src_, ssse3);
#endif
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out2, out3, out4, out5, out6, out7) \
{ \
......@@ -900,3 +915,6 @@ void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
// int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_, ssse3);
#if CONFIG_LOOP_RESTORATION
FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3);
#endif
......@@ -15,6 +15,7 @@
SECTION_RODATA
pw_64: times 8 dw 64
even_byte_mask: times 8 dw 0x00ff
; %define USE_PMULHRSW
; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
......@@ -142,6 +143,14 @@ cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
paddsw m0, m1
paddsw m0, krd
psraw m0, 7
%ifidn %1, h8_add_src
pxor m3, m3
movu m4, [srcq]
movu m5, [srcq + sstrideq]
punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1
punpcklbw m4, m3
paddsw m0, m4
%endif
packuswb m0, m0
psrldq m1, m0, 4
......@@ -178,6 +187,12 @@ cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
paddsw m0, m1
paddsw m0, krd
psraw m0, 7
%ifidn %1, h8_add_src
pxor m3, m3
movu m4, [srcq]
punpcklbw m4, m3
paddsw m0, m4
%endif
packuswb m0, m0
%ifidn %1, h8_avg
movd m4, [dstq]
......@@ -235,6 +250,15 @@ cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
paddsw m6, m0
paddsw m6, krd
psraw m6, 7
%ifidn %1, h8_add_src
pxor m3, m3
movu m4, [srcq]
movu m5, [srcq + sstrideq]
punpcklbw m4, m3
punpcklbw m5, m3
paddsw m1, m4
paddsw m6, m5
%endif
packuswb m1, m6
%ifidn %1, h8_avg
pavgb m1, m2
......@@ -269,6 +293,12 @@ cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
paddsw m1, m4
paddsw m1, krd
psraw m1, 7
%ifidn %1, h8_add_src
pxor m6, m6
movu m5, [srcq]
punpcklbw m5, m6
paddsw m1, m5
%endif
packuswb m1, m1
%ifidn %1, h8_avg
movh m0, [dstq]
......@@ -315,6 +345,14 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
paddsw m4, krd
psraw m0, 7
psraw m4, 7
%ifidn %1, h8_add_src
movu m5, [srcq]
mova m7, m5
pand m5, [even_byte_mask]
psrlw m7, 8
paddsw m0, m5
paddsw m4, m7
%endif
packuswb m0, m0
packuswb m4, m4
punpcklbw m0, m4
......@@ -337,6 +375,12 @@ SUBPIX_HFILTER8 h8_avg
SUBPIX_HFILTER4 h8
SUBPIX_HFILTER4 h8_avg
%if CONFIG_LOOP_RESTORATION
SUBPIX_HFILTER16 h8_add_src
SUBPIX_HFILTER8 h8_add_src
SUBPIX_HFILTER4 h8_add_src
%endif
;-------------------------------------------------------------------------------
; TODO(Linfeng): Detect cpu type and choose the code with better performance.
......@@ -413,12 +457,23 @@ cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
paddsw m0, krd
psraw m0, 7
paddsw m1, m5
%ifidn %1, v8_add_src
pxor m6, m6
movu m4, [srcq]
punpcklbw m4, m6
paddsw m0, m4
%endif
packuswb m0, m0
paddsw m3, m7
paddsw m1, m3
paddsw m1, krd
psraw m1, 7
%ifidn %1, v8_add_src
movu m4, [src1q]
punpcklbw m4, m6
paddsw m1, m4
%endif
lea srcq, [srcq + sstrideq * 2 ]
lea src1q, [src1q + sstrideq * 2]
packuswb m1, m1
......@@ -462,6 +517,12 @@ cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
paddsw m0, m2
paddsw m0, krd
psraw m0, 7
%ifidn %1, v8_add_src
pxor m6, m6
movu m4, [srcq]
punpcklbw m4, m6
paddsw m0, m4
%endif
packuswb m0, m0
%ifidn %1, v8_avg
movx m1, [dstq]
......@@ -643,6 +704,15 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
paddsw m3, m7
paddsw m3, krd
psraw m3, 7
%ifidn %1, v8_add_src
pxor m6, m6
movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
mova m5, m4
punpcklbw m4, m6
punpckhbw m5, m6
paddsw m0, m4
paddsw m3, m5
%endif
packuswb m0, m3
add srcq, sstrideq
......@@ -804,3 +874,10 @@ SUBPIX_VFILTER v8, 8
SUBPIX_VFILTER v8_avg, 8
SUBPIX_VFILTER v8, 4
SUBPIX_VFILTER v8_avg, 4
%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \
CONFIG_LOOP_RESTORATION
SUBPIX_VFILTER16 v8_add_src
SUBPIX_VFILTER v8_add_src, 8
SUBPIX_VFILTER v8_add_src, 4
%endif
......@@ -31,11 +31,7 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
(void)x_step_q4; \
(void)filter_y; \
(void)y_step_q4; \
if (filter[3] >= 128) { \
aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
x_step_q4, filter_y, y_step_q4, w, h); \
return; \
} \
assert((-128 <= filter[3]) && (filter[3] <= 127)); \
assert(step_q4 == 16); \
if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
......@@ -93,11 +89,8 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h) { \
if (filter_x[3] >= 128 || filter_y[3] >= 128) { \
aom_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter_x, \
x_step_q4, filter_y, y_step_q4, w, h); \
return; \
} \
assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \
assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \
assert(w <= MAX_SB_SIZE); \
assert(h <= MAX_SB_SIZE); \
assert(x_step_q4 == 16); \
......@@ -122,8 +115,71 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
} \
}
#if CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_LOOP_RESTORATION
// convolve_add_src is only used by the Wiener filter, which will never
// end up calling the bilinear functions (it uses a symmetric filter, so
// the possible numbers of taps are 1,3,5,7)
#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \
opt) \
void aom_convolve8_##name##_##opt( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h) { \
(void)filter_x; \
(void)x_step_q4; \
(void)filter_y; \
(void)y_step_q4; \
assert((-128 <= filter[3]) && (filter[3] <= 127)); \
assert(step_q4 == 16); \
while (w >= 16) { \
aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
src += 16; \
dst += 16; \
w -= 16; \
} \
while (w >= 8) { \
aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
src += 8; \
dst += 8; \
w -= 8; \
} \
while (w >= 4) { \
aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
src += 4; \
dst += 4; \
w -= 4; \
} \
if (w) { \
aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
x_step_q4, filter_y, y_step_q4, w, h); \
} \
}
#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt) \
void aom_convolve8_##type##opt( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h) { \
DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \
assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \
assert(w <= MAX_SB_SIZE); \
assert(h <= MAX_SB_SIZE); \
assert(x_step_q4 == 16); \
assert(y_step_q4 == 16); \
aom_convolve8_##htype##horiz_##opt( \
src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x, \
x_step_q4, filter_y, y_step_q4, w, h + 7); \
aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
dst, dst_stride, filter_x, x_step_q4, \
filter_y, y_step_q4, w, h); \
}
#endif
#if CONFIG_AOM_HIGHBITDEPTH
typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
const ptrdiff_t src_pitch,
uint16_t *output_ptr,
......
......@@ -208,6 +208,8 @@ static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
}
hkernel[WIENER_WIN] = 0;
vkernel[WIENER_WIN] = 0;
hkernel[3] -= 128;
vkernel[3] -= 128;
av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
tile_width, tile_height, width, height, 0, 0,
&h_start, &h_end, &v_start, &v_end);
......@@ -219,8 +221,8 @@ static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
int h = AOMMIN(MAX_SB_SIZE, (v_end - i + 15) & ~15);
const uint8_t *data_p = data + i * stride + j;
uint8_t *dst_p = dst + i * dst_stride + j;
aom_convolve8(data_p, stride, dst_p, dst_stride, hkernel, 16, vkernel, 16,
w, h);
aom_convolve8_add_src(data_p, stride, dst_p, dst_stride, hkernel, 16,
vkernel, 16, w, h);
}
}
......@@ -779,6 +781,8 @@ static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
}
hkernel[WIENER_WIN] = 0;
vkernel[WIENER_WIN] = 0;
hkernel[3] -= 128;
vkernel[3] -= 128;
av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
tile_width, tile_height, width, height, 0, 0,
&h_start, &h_end, &v_start, &v_end);
......@@ -790,9 +794,9 @@ static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
int h = AOMMIN(MAX_SB_SIZE, (v_end - i + 15) & ~15);
const uint16_t *data_p = data + i * stride + j;
uint16_t *dst_p = dst + i * dst_stride + j;
aom_highbd_convolve8_c(CONVERT_TO_BYTEPTR(data_p), stride,
CONVERT_TO_BYTEPTR(dst_p), dst_stride, hkernel, 16,
vkernel, 16, w, h, bit_depth);
aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
CONVERT_TO_BYTEPTR(dst_p), dst_stride,
hkernel, 16, vkernel, 16, w, h, bit_depth);
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment