Commit be6cc07d authored by David Barker, committed by Debargha Mukherjee

Add new convolve variant for loop-restoration

The convolve filters generated by loop_wiener_filter_tile are not
compatible with some of the existing optimized convolve
implementations: they can have coefficients > 128, sums of certain
subsets of coefficients > 128, and so on, which exceeds the ranges
those implementations assume.

So we implement a new variant, which takes a filter with 128
subtracted from its central element and adds an extra copy of the
source pixel just before clipping to pixel range (reinstating the
128 we subtracted). This is easy to adapt from the existing
convolve functions, and this patch includes SSE2 highbd and SSSE3
lowbd implementations.

Change-Id: I0abf4c2915f0665c49d88fe450dbc77b783f69e1
parent 469a5c80
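
The identity the new variant relies on can be checked with a small
standalone sketch (illustrative only, not part of the patch; the kernel
values below are invented, but like a Wiener kernel they sum to
128 = 1 << FILTER_BITS, with a central tap that does not fit in 8 bits):

#include <assert.h>
#include <stdint.h>

#define FILTER_BITS 7
#define SUBPEL_TAPS 8

// Rounded convolution of one output pixel, as in the C reference below
// (clipping omitted for brevity).
static int convolve_one(const uint8_t *src, const int16_t *k) {
  int sum = 0;
  for (int i = 0; i < SUBPEL_TAPS; ++i) sum += src[i] * k[i];
  return (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
}

int main(void) {
  // k sums to 128 but its central tap (index 3) exceeds 127;
  // k_adj is k with 128 subtracted from that central tap.
  const int16_t k[SUBPEL_TAPS] = { -5, 10, -20, 158, -20, 10, -5, 0 };
  const int16_t k_adj[SUBPEL_TAPS] = { -5, 10, -20, 30, -20, 10, -5, 0 };
  const uint8_t src[SUBPEL_TAPS] = { 10, 50, 90, 130, 170, 210, 250, 30 };
  // 128 * src[3] is an exact multiple of 1 << FILTER_BITS, so removing
  // it from the filtered sum and adding src[3] back after the rounding
  // shift leaves the result unchanged.
  assert(convolve_one(src, k) ==
         convolve_one(src, k_adj) + src[SUBPEL_TAPS / 2 - 1]);
  return 0;
}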
@@ -332,6 +332,122 @@ void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
filter_y, y_step_q4, w, h);
}
#if CONFIG_LOOP_RESTORATION
static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *x_filters, int x0_q4,
int x_step_q4, int w, int h) {
int x, y;
src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
int x_q4 = x0_q4;
for (x = 0; x < w; ++x) {
const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
src_x[SUBPEL_TAPS / 2 - 1]);
x_q4 += x_step_q4;
}
src += src_stride;
dst += dst_stride;
}
}
static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *y_filters, int y0_q4,
int y_step_q4, int w, int h) {
int x, y;
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
for (x = 0; x < w; ++x) {
int y_q4 = y0_q4;
for (y = 0; y < h; ++y) {
const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k)
sum += src_y[k * src_stride] * y_filter[k];
dst[y * dst_stride] =
clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
y_q4 += y_step_q4;
}
++src;
++dst;
}
}
static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *const x_filters, int x0_q4,
int x_step_q4, const InterpKernel *const y_filters,
int y0_q4, int y_step_q4, int w, int h) {
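// Two-pass 2-D filter: filter horizontally into a fixed-size intermediate
// buffer, then filter that buffer vertically. (See the buffer-size
// derivation on the highbd version of this function below.)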
uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= MAX_SB_SIZE);
assert(h <= MAX_SB_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
intermediate_height);
convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
}
void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
(void)filter_y;
(void)y_step_q4;
convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
x_step_q4, w, h);
}
void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
(void)filter_x;
(void)x_step_q4;
convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
y_step_q4, w, h);
}
void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
}
#endif // CONFIG_LOOP_RESTORATION
#if CONFIG_AOM_HIGHBITDEPTH
static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
@@ -597,4 +713,142 @@ void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
dst += dst_stride;
}
}
#endif
#if CONFIG_LOOP_RESTORATION
static void highbd_convolve_add_src_horiz(const uint8_t *src8,
ptrdiff_t src_stride, uint8_t *dst8,
ptrdiff_t dst_stride,
const InterpKernel *x_filters,
int x0_q4, int x_step_q4, int w,
int h, int bd) {
int x, y;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
int x_q4 = x0_q4;
for (x = 0; x < w; ++x) {
const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
dst[x] = clip_pixel_highbd(
ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
bd);
x_q4 += x_step_q4;
}
src += src_stride;
dst += dst_stride;
}
}
static void highbd_convolve_add_src_vert(const uint8_t *src8,
ptrdiff_t src_stride, uint8_t *dst8,
ptrdiff_t dst_stride,
const InterpKernel *y_filters,
int y0_q4, int y_step_q4, int w, int h,
int bd) {
int x, y;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
src -= src_stride * (SUBPEL_TAPS / 2 - 1);
for (x = 0; x < w; ++x) {
int y_q4 = y0_q4;
for (y = 0; y < h; ++y) {
const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
int k, sum = 0;
for (k = 0; k < SUBPEL_TAPS; ++k)
sum += src_y[k * src_stride] * y_filter[k];
dst[y * dst_stride] =
clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
bd);
y_q4 += y_step_q4;
}
++src;
++dst;
}
}
static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *const x_filters,
int x0_q4, int x_step_q4,
const InterpKernel *const y_filters,
int y0_q4, int y_step_q4, int w, int h,
int bd) {
// Note: Fixed size intermediate buffer, temp, places limits on parameters.
// 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
// (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (135):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
// original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= MAX_SB_SIZE);
assert(h <= MAX_SB_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
src_stride, CONVERT_TO_BYTEPTR(temp),
MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
intermediate_height, bd);
highbd_convolve_add_src_vert(
CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
}
void aom_highbd_convolve8_add_src_horiz_c(
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
(void)filter_y;
(void)y_step_q4;
highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x,
x0_q4, x_step_q4, w, h, bd);
}
void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src,
ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
(void)filter_x;
(void)x_step_q4;
highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y,
y0_q4, y_step_q4, w, h, bd);
}
void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
}
#endif // CONFIG_LOOP_RESTORATION
#endif // CONFIG_AOM_HIGHBITDEPTH
@@ -734,6 +734,16 @@ specialize qw/aom_convolve8_avg_horiz sse2 ssse3/;
specialize qw/aom_convolve8_avg_vert sse2 ssse3/;
specialize qw/aom_scaled_2d ssse3/;
if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void aom_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/aom_convolve8_add_src ssse3/;
specialize qw/aom_convolve8_add_src_horiz ssse3/;
specialize qw/aom_convolve8_add_src_vert ssse3/;
} # CONFIG_LOOP_RESTORATION
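# For reference, rtcd.pl expands each add_proto/specialize pair above into
# a function pointer in the generated aom_dsp_rtcd.h; a rough sketch of the
# generated dispatch (exact output varies by platform):
#   RTCD_EXTERN void (*aom_convolve8_add_src)(const uint8_t *src, ...);
#   aom_convolve8_add_src = aom_convolve8_add_src_c;
#   if (flags & HAS_SSSE3) aom_convolve8_add_src = aom_convolve8_add_src_ssse3;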
# TODO(any): These need to be extended to up to 128x128 block sizes
if (!(aom_config("CONFIG_AV1") eq "yes" && aom_config("CONFIG_EXT_PARTITION") eq "yes")) {
specialize qw/aom_convolve_copy neon dspr2 msa/;
@@ -770,6 +780,16 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/aom_highbd_convolve8_avg_vert/, "$sse2_x86_64";
if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void aom_highbd_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
add_proto qw/void aom_highbd_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
add_proto qw/void aom_highbd_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/aom_highbd_convolve8_add_src sse2/;
specialize qw/aom_highbd_convolve8_add_src_horiz sse2/;
specialize qw/aom_highbd_convolve8_add_src_vert sse2/;
} # CONFIG_LOOP_RESTORATION
} # CONFIG_AOM_HIGHBITDEPTH
#
@@ -159,5 +159,24 @@ HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
// int w, int h, int bd);
HIGH_FUN_CONV_2D(, sse2);
HIGH_FUN_CONV_2D(avg_, sse2);
#if CONFIG_LOOP_RESTORATION
// The SSE2 highbd convolve functions can deal with coefficients up to 32767.
// So redirect highbd_convolve8_add_src to regular highbd_convolve8.
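// Note: the adjustment below writes through pointers whose constness is
// cast away, so the caller's filter arrays are briefly modified in place;
// this is not safe if another thread convolves with the same filter
// tables concurrently.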
void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
assert(x_step_q4 == 16);
assert(y_step_q4 == 16);
((int16_t *)filter_x)[3] += 128;
((int16_t *)filter_y)[3] += 128;
aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h, bd);
((int16_t *)filter_x)[3] -= 128;
((int16_t *)filter_y)[3] -= 128;
}
#endif // CONFIG_LOOP_RESTORATION
#endif // CONFIG_AOM_HIGHBITDEPTH && ARCH_X86_64
#endif // HAVE_SSE2
@@ -291,6 +291,14 @@ filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3;
#if CONFIG_LOOP_RESTORATION
filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
#endif
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
@@ -331,6 +339,13 @@ FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
ssse3);
#if CONFIG_LOOP_RESTORATION
FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
ssse3);
FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
src - src_stride * 3, add_src_, ssse3);
#endif
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out2, out3, out4, out5, out6, out7) \
{ \
@@ -900,3 +915,6 @@ void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
// int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_, ssse3);
#if CONFIG_LOOP_RESTORATION
FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3);
#endif
@@ -15,6 +15,7 @@
SECTION_RODATA
pw_64: times 8 dw 64
even_byte_mask: times 8 dw 0x00ff
; %define USE_PMULHRSW
; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
@@ -142,6 +143,14 @@ cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
paddsw m0, m1
paddsw m0, krd
psraw m0, 7
%ifidn %1, h8_add_src
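; Add an extra copy of the source pixels (zero-extended to words) to the
; filtered rows before packing, reinstating the 128 * src term removed
; from the filter's central tap.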
pxor m3, m3
movu m4, [srcq]
movu m5, [srcq + sstrideq]
punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1
punpcklbw m4, m3
paddsw m0, m4
%endif
packuswb m0, m0
psrldq m1, m0, 4
@@ -178,6 +187,12 @@ cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
paddsw m0, m1
paddsw m0, krd
psraw m0, 7
%ifidn %1, h8_add_src
pxor m3, m3
movu m4, [srcq]
punpcklbw m4, m3
paddsw m0, m4
%endif
packuswb m0, m0
%ifidn %1, h8_avg
movd m4, [dstq]
@@ -235,6 +250,15 @@ cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
paddsw m6, m0
paddsw m6, krd
psraw m6, 7
%ifidn %1, h8_add_src
pxor m3, m3
movu m4, [srcq]
movu m5, [srcq + sstrideq]
punpcklbw m4, m3
punpcklbw m5, m3
paddsw m1, m4
paddsw m6, m5
%endif
packuswb m1, m6
%ifidn %1, h8_avg
pavgb m1, m2
@@ -269,6 +293,12 @@ cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
paddsw m1, m4
paddsw m1, krd
psraw m1, 7
%ifidn %1, h8_add_src
pxor m6, m6
movu m5, [srcq]
punpcklbw m5, m6
paddsw m1, m5
%endif
packuswb m1, m1
%ifidn %1, h8_avg
movh m0, [dstq]
@@ -315,6 +345,14 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
paddsw m4, krd
psraw m0, 7
psraw m4, 7
%ifidn %1, h8_add_src
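; m0 holds the even output pixels and m4 the odd ones (the punpcklbw
; below interleaves them), so split the 16 source bytes into even and
; odd positions before adding them back.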
movu m5, [srcq]
mova m7, m5
pand m5, [even_byte_mask]
psrlw m7, 8
paddsw m0, m5
paddsw m4, m7
%endif
packuswb m0, m0
packuswb m4, m4
punpcklbw m0, m4
@@ -337,6 +375,12 @@ SUBPIX_HFILTER8 h8_avg
SUBPIX_HFILTER4 h8
SUBPIX_HFILTER4 h8_avg
%if CONFIG_LOOP_RESTORATION
SUBPIX_HFILTER16 h8_add_src
SUBPIX_HFILTER8 h8_add_src
SUBPIX_HFILTER4 h8_add_src
%endif
;-------------------------------------------------------------------------------
; TODO(Linfeng): Detect cpu type and choose the code with better performance.
@@ -413,12 +457,23 @@ cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
paddsw m0, krd
psraw m0, 7
paddsw m1, m5
%ifidn %1, v8_add_src
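; Add an extra copy of the source row (zero-extended to words) into the
; filtered result before packing; the second row is handled below.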
pxor m6, m6
movu m4, [srcq]
punpcklbw m4, m6
paddsw m0, m4
%endif
packuswb m0, m0
paddsw m3, m7
paddsw m1, m3
paddsw m1, krd
psraw m1, 7
%ifidn %1, v8_add_src
movu m4, [src1q]
punpcklbw m4, m6
paddsw m1, m4
%endif
lea srcq, [srcq + sstrideq * 2 ]
lea src1q, [src1q + sstrideq * 2]
packuswb m1, m1
@@ -462,6 +517,12 @@ cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
paddsw m0, m2
paddsw m0, krd
psraw m0, 7
%ifidn %1, v8_add_src
pxor m6, m6
movu m4, [srcq]
punpcklbw m4, m6
paddsw m0, m4
%endif
packuswb m0, m0
%ifidn %1, v8_avg
movx m1, [dstq]
@@ -643,6 +704,15 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
paddsw m3, m7
paddsw m3, krd
psraw m3, 7
%ifidn %1, v8_add_src
pxor m6, m6
movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
mova m5, m4
punpcklbw m4, m6
punpckhbw m5, m6
paddsw m0, m4
paddsw m3, m5
%endif
packuswb m0, m3
add srcq, sstrideq
@@ -804,3 +874,10 @@ SUBPIX_VFILTER v8, 8
SUBPIX_VFILTER v8_avg, 8
SUBPIX_VFILTER v8, 4
SUBPIX_VFILTER v8_avg, 4
%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \
CONFIG_LOOP_RESTORATION
SUBPIX_VFILTER16 v8_add_src
SUBPIX_VFILTER v8_add_src, 8
SUBPIX_VFILTER v8_add_src, 4
%endif
@@ -31,11 +31,7 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
(void)x_step_q4; \
(void)filter_y; \
(void)y_step_q4; \
if (filter[3] >= 128) { \
aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
x_step_q4, filter_y, y_step_q4, w, h); \
return; \
} \
assert((-128 <= filter[3]) && (filter[3] <= 127)); \
assert(step_q4 == 16); \
if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
@@ -93,11 +89,8 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h) { \
if (filter_x[3] >= 128 || filter_y[3] >= 128) { \
aom_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter_x, \
x_step_q4, filter_y, y_step_q4, w, h); \
return; \
} \
assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \
assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \
assert(w <= MAX_SB_SIZE); \
assert(h <= MAX_SB_SIZE); \
assert(x_step_q4 == 16); \
@@ -122,8 +115,71 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
} \
}
#if CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_LOOP_RESTORATION
// convolve_add_src is only used by the Wiener filter, which will never
// end up calling the bilinear functions (it uses a symmetric filter, so
// the possible numbers of taps are 1,3,5,7)
#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \
opt) \
void aom_convolve8_##name##_##opt( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h) { \
(void)filter_x; \
(void)x_step_q4; \
(void)filter_y; \
(void)y_step_q4; \
assert((-128 <= filter[3]) && (filter[3] <= 127)); \
assert(step_q4 == 16); \
while (w >= 16) { \
aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
src += 16; \
dst += 16; \
w -= 16; \
} \
while (w >= 8) { \
aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
src += 8; \
dst += 8; \
w -= 8; \
} \
while (w >= 4) { \
aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
src += 4; \
dst += 4; \
w -= 4; \
} \
if (w) { \
aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
x_step_q4, filter_y, y_step_q4, w, h); \
} \
}
#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt) \
void aom_convolve8_##type##opt( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h) { \
DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \
assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \
assert(w <= MAX_SB_SIZE); \
assert(h <= MAX_SB_SIZE); \
assert(x_step_q4 == 16); \
assert(y_step_q4 == 16); \
aom_convolve8_##htype##horiz_##opt( \
src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x, \
x_step_q4, filter_y, y_step_q4, w, h + 7); \
aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
dst, dst_stride, filter_x, x_step_q4, \
filter_y, y_step_q4, w, h); \