Commit b4d4aff4 authored by Deepa K G's avatar Deepa K G Committed by Yunqing Wang

Some fixes and clean-ups on convolve functions

Make av1_convolve_x_sr_sse2/avx2 support various bit-shift
options.

Add asserts to the convolve functions.

Change-Id: Ib6d1ada6c00a20e6e498af2672bd0bb76040d7d0
parent c7a5e883
......@@ -447,6 +447,8 @@ void av1_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst0,
(void)dst0;
(void)dst_stride0;
assert(bits >= 0);
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
......@@ -481,6 +483,8 @@ void av1_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst0,
(void)dst0;
(void)dst_stride0;
assert(bits >= 0);
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
......@@ -590,6 +594,10 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
(void)subpel_x_q4;
(void)conv_params;
assert(conv_params->round_0 <= FILTER_BITS);
assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
// vertical filter
const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
......@@ -617,6 +625,10 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
(void)subpel_y_q4;
(void)conv_params;
assert(bits >= 0);
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
// horizontal filter
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
......
......@@ -43,6 +43,8 @@ void av1_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
__m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];
assert(conv_params->round_0 > 0);
filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
......@@ -176,6 +178,8 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
__m256i filt[4], coeffs_h[4], coeffs_v[4];
assert(conv_params->round_0 > 0);
filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
......
......@@ -41,6 +41,8 @@ void av1_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i zero = _mm_setzero_si128();
assert(conv_params->round_0 > 0);
/* Horizontal filter */
{
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
......@@ -226,6 +228,8 @@ void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
assert(conv_params->round_0 > 0);
/* Horizontal filter */
{
const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
......
......@@ -359,6 +359,8 @@ void av1_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i avg_mask = _mm256_set1_epi32(conv_params->do_average ? -1 : 0);
__m256i coeffs[4], s[8];
assert((FILTER_BITS - conv_params->round_0) >= 0);
prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
(void)conv_params;
......@@ -514,6 +516,10 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
_mm256_set1_epi16((1 << right_shift_bits) >> 1);
__m256i coeffs[4], s[8];
assert(conv_params->round_0 <= FILTER_BITS);
assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
(void)filter_params_x;
......@@ -665,6 +671,9 @@ void av1_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
__m256i filt[4], coeffs[4];
assert(bits >= 0);
assert(conv_params->round_0 > 0);
filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
......@@ -720,6 +729,7 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
int i, j;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint8_t *const src_ptr = src - fo_horiz;
const int bits = FILTER_BITS - conv_params->round_0;
__m256i filt[4], coeffs[4];
......@@ -730,14 +740,20 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
const __m256i round_const =
_mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) +
((1 << (FILTER_BITS - 1)) >> 1));
const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS - 1);
const __m256i round_0_const =
_mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
const __m128i round_shift = _mm_cvtsi32_si128(bits);
(void)filter_params_y;
(void)subpel_y_q4;
assert(bits >= 0);
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
assert(conv_params->round_0 > 0);
for (i = 0; i < h; ++i) {
for (j = 0; j < w; j += 16) {
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18
......@@ -748,7 +764,9 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
__m256i res_16b = convolve_x(data, coeffs, filt);
// Combine V round and 2F-H-V round into a single rounding
res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
round_0_shift);
res_16b =
_mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift);
......
......@@ -105,6 +105,8 @@ void av1_convolve_y_sse2(const uint8_t *src, int src_stride,
(void)dst0;
(void)dst_stride0;
assert(bits >= 0);
prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
if (w == 4) {
......@@ -252,6 +254,8 @@ void av1_convolve_x_sse2(const uint8_t *src, int src_stride,
(void)dst0;
(void)dst_stride0;
assert(bits >= 0);
prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
if (w == 4) {
......@@ -335,6 +339,10 @@ void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride,
(void)subpel_x_q4;
(void)conv_params;
assert(conv_params->round_0 <= FILTER_BITS);
assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
if (w <= 4) {
......@@ -484,14 +492,21 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
ConvolveParams *conv_params) {
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint8_t *src_ptr = src - fo_horiz;
const __m128i round_const = _mm_set1_epi32(
((1 << conv_params->round_0) >> 1) + (1 << (FILTER_BITS - 1)));
const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
const int bits = FILTER_BITS - conv_params->round_0;
const __m128i round_0_const =
_mm_set1_epi32((1 << conv_params->round_0) >> 1);
const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
const __m128i round_shift = _mm_cvtsi32_si128(bits);
__m128i coeffs[4];
(void)filter_params_y;
(void)subpel_y_q4;
assert(bits >= 0);
assert((FILTER_BITS - conv_params->round_1) >= 0 ||
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
if (w <= 4) {
......@@ -507,8 +522,10 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
s[3] =
_mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
const __m128i res_lo = convolve_lo_x(s, coeffs);
const __m128i res_lo_round =
_mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
__m128i res_lo_round =
_mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
res_lo_round =
_mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift);
const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
const __m128i res = _mm_packus_epi16(res16, res16);
......@@ -549,10 +566,14 @@ void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride,
// Rearrange pixels back into the order 0 ... 7
const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
const __m128i res_lo_round =
_mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
const __m128i res_hi_round =
_mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
__m128i res_lo_round =
_mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
round_shift);
__m128i res_hi_round =
_mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift);
res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
round_shift);
const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
const __m128i res = _mm_packus_epi16(res16, res16);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment