Commit 07089c68 authored by Rupert Swarbrick, committed by Debargha Mukherjee

Obey do_average flag when doing convolve_round

Doing this means that we don't have to memset temporary buffers to
zero in reconinter.c, which was taking ~5% of cycles in a short
encoding test (using perf to attach to a running encode).

Change-Id: Ibb6e31920000b876c6ee99f454d89c8a97e9fb31
parent 91a1cf91
......@@ -361,7 +361,10 @@ void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
}
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
dst[y * dst_stride + x] += res;
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
}
}
}
......@@ -416,7 +419,10 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
}
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
dst[y * dst_stride + x] += res;
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
}
src_vert++;
}
......@@ -475,7 +481,10 @@ void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
dst[y * dst_stride + x] += res;
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
}
}
}
......@@ -536,7 +545,10 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
dst[y * dst_stride + x] += res;
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
}
src_vert++;
}
......@@ -669,7 +681,10 @@ void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
}
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
dst[y * dst_stride + x] += res;
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
}
}
}
......@@ -725,7 +740,10 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
}
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
dst[y * dst_stride + x] += res;
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
}
src_vert++;
}
......@@ -778,7 +796,10 @@ void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
dst[y * dst_stride + x] += res;
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
}
}
}
......@@ -838,7 +859,10 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
((1 << (offset_bits - conv_params->round_1)) +
(1 << (offset_bits - conv_params->round_1 - 1)));
dst[y * dst_stride + x] += res;
if (conv_params->do_average)
dst[y * dst_stride + x] += res;
else
dst[y * dst_stride + x] = res;
}
src_vert++;
}
......
......@@ -975,7 +975,6 @@ void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
#if CONFIG_HIGHBITDEPTH
#if CONFIG_CONVOLVE_ROUND
DECLARE_ALIGNED(16, CONV_BUF_TYPE, tmp_dst2[MAX_SB_SQUARE]);
memset(tmp_dst2, 0, sizeof(tmp_dst2));
int tmp_dst2_stride = MAX_SB_SIZE;
CONV_BUF_TYPE *org_dst = conv_params->dst;
int org_dst_stride = conv_params->dst_stride;
......@@ -1310,7 +1309,6 @@ static INLINE void build_inter_predictors(
is_compound = has_second_ref(this_mbmi);
#if CONFIG_CONVOLVE_ROUND
DECLARE_ALIGNED(16, int32_t, tmp_dst[8 * 8]);
av1_zero(tmp_dst);
int tmp_dst_stride = 8;
assert(w <= 8 && h <= 8);
#endif // CONFIG_CONVOLVE_ROUND
......@@ -1489,7 +1487,6 @@ static INLINE void build_inter_predictors(
SubpelParams subpel_params[2];
#if CONFIG_CONVOLVE_ROUND
DECLARE_ALIGNED(16, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
av1_zero(tmp_dst);
#endif // CONFIG_CONVOLVE_ROUND
#if CONFIG_EXT_INTER && CONFIG_COMPOUND_SINGLEREF
......
......@@ -1050,7 +1050,10 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
(1 << (offset_bits_horiz + FILTER_BITS -
conv_params->round_0 - conv_params->round_1)) -
(1 << (offset_bits_vert - conv_params->round_1));
*p += sum;
if (conv_params->do_average)
*p += sum;
else
*p = sum;
} else {
#else
{
......@@ -1395,7 +1398,10 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
(1 << (offset_bits_horiz + FILTER_BITS -
conv_params->round_0 - conv_params->round_1)) -
(1 << (offset_bits_vert - conv_params->round_1));
*p += sum;
if (conv_params->do_average)
*p += sum;
else
*p = sum;
} else {
#else
{
......
......@@ -31,6 +31,7 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
int i, j;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int do_average = conv_params->do_average;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
const __m128i zero = _mm_setzero_si128();
......@@ -181,9 +182,15 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
// Accumulate values into the destination buffer
__m128i *const p = (__m128i *)&dst[i * dst_stride + j];
_mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round));
_mm_storeu_si128(p + 1,
_mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
if (do_average) {
_mm_storeu_si128(p + 0,
_mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
_mm_storeu_si128(p + 1,
_mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
} else {
_mm_storeu_si128(p + 0, res_lo_round);
_mm_storeu_si128(p + 1, res_hi_round);
}
}
}
}
......@@ -204,6 +211,7 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
int i, j;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int do_average = conv_params->do_average;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
const __m128i zero = _mm_setzero_si128();
......@@ -357,9 +365,15 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride,
// Accumulate values into the destination buffer
__m128i *const p = (__m128i *)&dst[i * dst_stride + j];
_mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round));
_mm_storeu_si128(p + 1,
_mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
if (do_average) {
_mm_storeu_si128(p + 0,
_mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
_mm_storeu_si128(p + 1,
_mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
} else {
_mm_storeu_si128(p + 0, res_lo_round);
_mm_storeu_si128(p + 1, res_hi_round);
}
}
}
}
......
......@@ -32,6 +32,7 @@ void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
int i, j;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int do_average = conv_params->do_average;
const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
/* Horizontal filter */
......@@ -185,9 +186,15 @@ void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
// Accumulate values into the destination buffer
__m128i *const p = (__m128i *)&dst[i * dst_stride + j];
_mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round));
_mm_storeu_si128(p + 1,
_mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
if (do_average) {
_mm_storeu_si128(p + 0,
_mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
_mm_storeu_si128(p + 1,
_mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
} else {
_mm_storeu_si128(p + 0, res_lo_round);
_mm_storeu_si128(p + 1, res_hi_round);
}
}
}
}
......@@ -204,6 +211,7 @@ void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
int im_h = h + filter_params_y->taps - 1;
int im_stride = MAX_SB_SIZE;
int i, j;
const int do_average = conv_params->do_average;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
......@@ -362,9 +370,15 @@ void av1_highbd_convolve_2d_ssse3(const uint16_t *src, int src_stride,
// Accumulate values into the destination buffer
__m128i *const p = (__m128i *)&dst[i * dst_stride + j];
_mm_storeu_si128(p, _mm_add_epi32(_mm_loadu_si128(p), res_lo_round));
_mm_storeu_si128(p + 1,
_mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
if (do_average) {
_mm_storeu_si128(p + 0,
_mm_add_epi32(_mm_loadu_si128(p + 0), res_lo_round));
_mm_storeu_si128(p + 1,
_mm_add_epi32(_mm_loadu_si128(p + 1), res_hi_round));
} else {
_mm_storeu_si128(p + 0, res_lo_round);
_mm_storeu_si128(p + 1, res_hi_round);
}
}
}
}
......
......@@ -320,21 +320,20 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
__m128i *const p =
(__m128i *)&conv_params
->dst[(i + k + 4) * conv_params->dst_stride + j];
const __m128i orig_lo = _mm_loadu_si128(p);
const __m128i round_const = _mm_set1_epi32(
-(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) +
((1 << (conv_params->round_1)) >> 1));
res_lo = _mm_add_epi32(res_lo, round_const);
res_lo = _mm_add_epi32(
orig_lo,
_mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1)));
res_lo =
_mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1));
if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo);
_mm_storeu_si128(p, res_lo);
if (p_width > 4) {
const __m128i orig_hi = _mm_loadu_si128(p + 1);
res_hi = _mm_add_epi32(res_hi, round_const);
res_hi = _mm_add_epi32(
orig_hi,
_mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1)));
res_hi =
_mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1));
if (comp_avg)
res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi);
_mm_storeu_si128(p + 1, res_hi);
}
} else {
......
......@@ -316,21 +316,20 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
__m128i *const p =
(__m128i *)&conv_params
->dst[(i + k + 4) * conv_params->dst_stride + j];
const __m128i orig_lo = _mm_loadu_si128(p);
const __m128i round_const = _mm_set1_epi32(
-(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) +
((1 << (conv_params->round_1)) >> 1));
res_lo = _mm_add_epi32(res_lo, round_const);
res_lo = _mm_add_epi32(
orig_lo,
_mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1)));
res_lo =
_mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1));
if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo);
_mm_storeu_si128(p, res_lo);
if (p_width > 4) {
const __m128i orig_hi = _mm_loadu_si128(p + 1);
res_hi = _mm_add_epi32(res_hi, round_const);
res_hi = _mm_add_epi32(
orig_hi,
_mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1)));
res_hi =
_mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1));
if (comp_avg)
res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi);
_mm_storeu_si128(p + 1, res_hi);
}
} else {
......
......@@ -492,21 +492,20 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
__m128i *const p =
(__m128i *)&conv_params
->dst[(i + k + 4) * conv_params->dst_stride + j];
const __m128i orig_lo = _mm_loadu_si128(p);
const __m128i round_const = _mm_set1_epi32(
-(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) +
((1 << (conv_params->round_1)) >> 1));
res_lo = _mm_add_epi32(res_lo, round_const);
res_lo = _mm_add_epi32(
orig_lo,
_mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1)));
res_lo =
_mm_srl_epi16(res_lo, _mm_cvtsi32_si128(conv_params->round_1));
if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo);
_mm_storeu_si128(p, res_lo);
if (p_width > 4) {
const __m128i orig_hi = _mm_loadu_si128(p + 1);
res_hi = _mm_add_epi32(res_hi, round_const);
res_hi = _mm_add_epi32(
orig_hi,
_mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1)));
res_hi =
_mm_srl_epi16(res_hi, _mm_cvtsi32_si128(conv_params->round_1));
if (comp_avg)
res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi);
_mm_storeu_si128(p + 1, res_hi);
}
} else {
......
......@@ -56,10 +56,11 @@ void AV1Convolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
av1_get_interp_filter_params((InterpFilter)hfilter);
InterpFilterParams filter_params_y =
av1_get_interp_filter_params((InterpFilter)vfilter);
const int do_average = rnd_.Rand8() & 1;
ConvolveParams conv_params1 =
get_conv_params_no_round(0, 0, 0, output, MAX_SB_SIZE);
get_conv_params_no_round(0, do_average, 0, output, MAX_SB_SIZE);
ConvolveParams conv_params2 =
get_conv_params_no_round(0, 0, 0, output2, MAX_SB_SIZE);
get_conv_params_no_round(0, do_average, 0, output2, MAX_SB_SIZE);
for (subx = 0; subx < 16; ++subx)
for (suby = 0; suby < 16; ++suby) {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment