Commit 94e3fe3b authored by Yunqing Wang

Add optimized convolve functions for single reference case

Added optimized convolve functions for single reference case, so that no
separate post rounding is needed and the result is written to the
destination buffer directly. Duplicate code will be cleaned up later.

Change-Id: Iffc0cc6e135b8b6f45a95c314d63368f5aa35f34
parent b101935f
......@@ -560,26 +560,34 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
}
# CONVOLVE_ROUND/COMPOUND_ROUND functions
add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d sse2 avx2/;
add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d_sr c/;
add_proto qw/void av1_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits";
specialize qw/av1_convolve_rounding avx2/;
add_proto qw/void av1_convolve_2d_copy/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
add_proto qw/void av1_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d_copy sse2/;
add_proto qw/void av1_convolve_x/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d_copy_sr c/;
add_proto qw/void av1_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_x sse2/;
add_proto qw/void av1_convolve_y/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
add_proto qw/void av1_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_y sse2/;
add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_x_sr c/;
add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_y_sr c/;
add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d_scale sse4_1/;
if (aom_config("CONFIG_JNT_COMP") eq "yes") {
add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_jnt_convolve_2d sse4_1/;
add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_jnt_convolve_2d_copy sse2/;
}
......
......@@ -373,7 +373,7 @@ void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst,
bit widths for various intermediate values, see the comments above
av1_warp_affine_c.
*/
void av1_convolve_2d_c(const uint8_t *src, int src_stride, const uint8_t *dst0,
void av1_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
......@@ -429,7 +429,7 @@ void av1_convolve_2d_c(const uint8_t *src, int src_stride, const uint8_t *dst0,
}
}
void av1_convolve_y_c(const uint8_t *src, int src_stride, const uint8_t *dst0,
void av1_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
......@@ -462,7 +462,7 @@ void av1_convolve_y_c(const uint8_t *src, int src_stride, const uint8_t *dst0,
}
}
void av1_convolve_x_c(const uint8_t *src, int src_stride, const uint8_t *dst0,
void av1_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
......@@ -495,8 +495,8 @@ void av1_convolve_x_c(const uint8_t *src, int src_stride, const uint8_t *dst0,
}
}
void av1_convolve_2d_copy_c(const uint8_t *src, int src_stride,
const uint8_t *dst0, int dst_stride0, int w, int h,
void av1_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
......@@ -524,9 +524,132 @@ void av1_convolve_2d_copy_c(const uint8_t *src, int src_stride,
}
}
/* Single-reference 2D subpel convolution (horizontal pass, then vertical).
 *
 * The horizontal pass filters h + taps_y - 1 rows into a 32-bit scratch
 * buffer, rounding by conv_params->round_0. The vertical pass filters that
 * buffer, rounds by conv_params->round_1, applies the remaining
 * 2 * FILTER_BITS - round_0 - round_1 bits of rounding, clips to 8 bits and
 * stores the pixel directly in dst, so no separate post-rounding step is
 * needed.
 *
 * src/src_stride  : source pixels; rows above/below and columns left/right
 *                   of the block are read according to the filter taps.
 * dst/dst_stride  : destination for the w x h block of 8-bit pixels.
 * filter_params_* : horizontal / vertical interpolation filter descriptions.
 * subpel_*_q4     : subpel positions; masked with SUBPEL_MASK to pick the
 *                   kernel.
 * conv_params     : supplies round_0 and round_1.
 */
void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          InterpFilterParams *filter_params_x,
                          InterpFilterParams *filter_params_y,
                          const int subpel_x_q4, const int subpel_y_q4,
                          ConvolveParams *conv_params) {
  const int bd = 8;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int im_stride = w;
  const int im_h = h + filter_params_y->taps - 1;
  int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];

  // Horizontal pass. It starts fo_vert rows above the block so the vertical
  // pass has the extra rows its kernel needs.
  const uint8_t *src_top = src - fo_vert * src_stride;
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int row = 0; row < im_h; ++row) {
    for (int col = 0; col < w; ++col) {
      // First input sample covered by the kernel for this output position.
      const uint8_t *tap_src = &src_top[row * src_stride + col - fo_horiz];
      // Bias the accumulator; the assert below checks it stays non-negative.
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < filter_params_x->taps; ++k) {
        sum += x_filter[k] * tap_src[k];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] =
          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    }
  }

  // Vertical pass. Output row r reads scratch rows r .. r + taps_y - 1.
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE sum = 1 << offset_bits;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * im_block[(row + k) * im_stride + col];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      // Subtract the accumulated biases after the round_1 shift.
      // NOTE(review): presumably exact only when the kernel taps sum to
      // 1 << FILTER_BITS - confirm against the filter tables.
      const CONV_BUF_TYPE unbiased =
          ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
          ((1 << (offset_bits - conv_params->round_1)) +
           (1 << (offset_bits - conv_params->round_1 - 1)));
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(unbiased, bits));
    }
  }
}
/* Single-reference vertical-only convolution.
 *
 * Applies the y kernel selected by subpel_y_q4 down each column of the
 * w x h block, rounds by FILTER_BITS, clips to 8 bits and writes the pixel
 * directly to dst. The x filter, subpel_x_q4 and conv_params are unused.
 */
void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         InterpFilterParams *filter_params_x,
                         InterpFilterParams *filter_params_y,
                         const int subpel_x_q4, const int subpel_y_q4,
                         ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)subpel_x_q4;
  (void)conv_params;

  // Number of rows above the current one that the kernel reaches.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);

  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      // Topmost input sample covered by the kernel for this output pixel.
      const uint8_t *tap_src = &src[(row - fo_vert) * src_stride + col];
      CONV_BUF_TYPE res = 0;
      for (int k = 0; k < filter_params_y->taps; ++k) {
        res += y_filter[k] * tap_src[k * src_stride];
      }
      dst[row * dst_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
    }
  }
}
/* Single-reference horizontal-only convolution.
 *
 * Applies the x kernel selected by subpel_x_q4 along each row of the w x h
 * block. The sum is rounded in two steps - first by conv_params->round_0,
 * then by the remaining FILTER_BITS - round_0 bits - clipped to 8 bits and
 * written directly to dst. The y filter and subpel_y_q4 are unused.
 *
 * conv_params->round_0 must not exceed FILTER_BITS; otherwise the second
 * shift count would be negative, which is undefined behavior in C.
 */
void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         InterpFilterParams *filter_params_x,
                         InterpFilterParams *filter_params_y,
                         const int subpel_x_q4, const int subpel_y_q4,
                         ConvolveParams *conv_params) {
  // Number of columns to the left of the current one that the kernel reaches.
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  // Rounding bits left over after the round_0 shift.
  const int bits = FILTER_BITS - conv_params->round_0;
  (void)filter_params_y;
  (void)subpel_y_q4;

  // bits is used as a shift count below; a negative count is undefined.
  assert(bits >= 0);

  // horizontal filter
  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      CONV_BUF_TYPE res = 0;
      for (int k = 0; k < filter_params_x->taps; ++k) {
        res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
      }
      res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
      dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    }
  }
}
/* Single-reference full-pel case: no filtering is applied.
 *
 * Copies the w x h block of 8-bit pixels from src to dst one pixel at a
 * time. The filter, subpel and conv_params arguments exist only so the
 * signature matches the other convolve functions; none of them is read.
 */
void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               InterpFilterParams *filter_params_x,
                               InterpFilterParams *filter_params_y,
                               const int subpel_x_q4, const int subpel_y_q4,
                               ConvolveParams *conv_params) {
  (void)conv_params;
  (void)subpel_y_q4;
  (void)subpel_x_q4;
  (void)filter_params_y;
  (void)filter_params_x;

  for (int row = 0; row < h; ++row) {
    const uint8_t *in_row = src + row * src_stride;
    uint8_t *out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      out_row[col] = in_row[col];
    }
  }
}
#if CONFIG_JNT_COMP
void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride,
const uint8_t *dst0, int dst_stride0, int w, int h,
void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
......@@ -593,8 +716,8 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride,
}
void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
const uint8_t *dst0, int dst_stride0, int w,
int h, InterpFilterParams *filter_params_x,
uint8_t *dst0, int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
......@@ -743,7 +866,7 @@ void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
&filter_params_y, subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, conv_params);
else
sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][1](
sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
src, src_stride, dst, dst_stride, w, h, &filter_params_x,
&filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
}
......
......@@ -44,8 +44,8 @@ typedef struct ConvolveParams {
} ConvolveParams;
typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
const uint8_t *dst, int dst_stride, int w,
int h, InterpFilterParams *filter_params_x,
uint8_t *dst, int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params);
......
......@@ -77,7 +77,11 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
interp_filters, subpel_x, xs, subpel_y, ys, 0,
conv_params, sf);
conv_params->do_post_rounding = 1;
if (conv_params->is_compound)
conv_params->do_post_rounding = 1;
else
conv_params->do_post_rounding = 0;
} else {
assert(conv_params->round == CONVOLVE_OPT_ROUND);
......
......@@ -182,9 +182,18 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
}
#endif // CONFIG_HIGHBITDEPTH
#if CONFIG_JNT_COMP
// AV1 convolve functions
// Special case convolve functions should produce the same result as
// av1_int_convolve_2d.
// av1_convolve_2d.
// subpel_x_q4 == 0 && subpel_y_q4 == 0
sf->convolve[0][0][0] = av1_convolve_2d_copy_sr;
// subpel_x_q4 == 0
sf->convolve[0][1][0] = av1_convolve_y_sr;
// subpel_y_q4 == 0
sf->convolve[1][0][0] = av1_convolve_x_sr;
// subpel_x_q4 != 0 && subpel_y_q4 != 0
sf->convolve[1][1][0] = av1_convolve_2d_sr;
#if CONFIG_JNT_COMP
// subpel_x_q4 == 0 && subpel_y_q4 == 0
sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
// subpel_x_q4 == 0
......@@ -196,8 +205,6 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
// subpel_x_q4 != 0 && subpel_y_q4 != 0
sf->convolve[1][1][1] = av1_jnt_convolve_2d;
#else
// Special case convolve functions should produce the same result as
// av1_convolve_2d.
// subpel_x_q4 == 0 && subpel_y_q4 == 0
sf->convolve[0][0][1] = av1_convolve_2d_copy;
// subpel_x_q4 == 0
......
......@@ -17,8 +17,8 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
void av1_convolve_2d_avx2(const uint8_t *src, int src_stride,
const uint8_t *dst0, int dst_stride0, int w, int h,
void av1_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
......
......@@ -17,8 +17,8 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
const uint8_t *dst0, int dst_stride0, int w, int h,
void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
......@@ -205,8 +205,8 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
}
void av1_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
const uint8_t *dst0, int dst_stride0, int w,
int h, InterpFilterParams *filter_params_x,
uint8_t *dst0, int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
......@@ -322,8 +322,8 @@ void av1_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
#if CONFIG_JNT_COMP
void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
const uint8_t *dst0, int dst_stride0, int w,
int h, InterpFilterParams *filter_params_x,
uint8_t *dst0, int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
......
......@@ -20,8 +20,8 @@
#if CONFIG_JNT_COMP
void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
const uint8_t *dst0, int dst_stride0, int w,
int h, InterpFilterParams *filter_params_x,
uint8_t *dst0, int dst_stride0, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
......
......@@ -25,8 +25,8 @@ namespace libaom_test {
namespace AV1Convolve2D {
typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride,
const uint8_t *dst, int dst_stride, int w,
int h, InterpFilterParams *filter_params_x,
uint8_t *dst, int dst_stride, int w, int h,
InterpFilterParams *filter_params_x,
InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment