Commit 0768d98b authored by Deepa K G, committed by Yunqing Wang

Faster AVX2 implementation of motion compensation modules

Speeds up av1_convolve_y_avx2 (~1.5x faster), av1_convolve_y_sr_avx2
(~1.8x faster), and av1_convolve_2d_sr_avx2 (~1.3x faster).

Change-Id: Iaed764a7c4d069a4180c3edb0b1ac57ad36dad21
parent 3ea816e8
@@ -81,6 +81,7 @@ set(AOM_DSP_COMMON_INTRIN_AVX2
"${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/inv_txfm_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
"${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
"${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h"
"${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h")
@@ -33,4 +33,101 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m256i *const coeffs /* [4] */) {
+  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params, subpel_q4 & SUBPEL_MASK);
+  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // Right shift all filter coefficients by 1 to reduce the number of bits
+  // required. This extra right shift is compensated at the end, when the
+  // result is rounded. Since all filter coefficients are even, halving them
+  // does not affect the end result.
+  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+                            _mm_set1_epi16(0xffff)));
+
+  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
+}
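
The halving trick relies on every subpel kernel having even coefficients, which keeps the _mm256_maddubs_epi16 products inside the 16-bit range. A minimal scalar sketch of why it is lossless, using a hypothetical even 8-tap kernel (FILTER_BITS is 7 in libaom):

#include <assert.h>
#include <stdio.h>

#define FILTER_BITS 7

int main(void) {
  // Hypothetical even kernel summing to 128, like libaom's subpel filters.
  const int filter[8] = { -2, 6, -12, 126, 16, -8, 4, -2 };
  const int src[8] = { 40, 42, 50, 60, 61, 59, 58, 57 };
  int full = 0, halved = 0;
  for (int k = 0; k < 8; ++k) {
    assert((filter[k] % 2) == 0);  // the evenness checked in prepare_coeffs()
    full += filter[k] * src[k];
    halved += (filter[k] >> 1) * src[k];
  }
  // Rounding with one bit less fully compensates for the halved coefficients.
  const int res_full = (full + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  const int res_half = (halved + (1 << (FILTER_BITS - 2))) >> (FILTER_BITS - 1);
  printf("%d %d\n", res_full, res_half);  // prints the same value twice
  return 0;
}
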
+static INLINE void prepare_coeffs_y_2d(
+    const InterpFilterParams *const filter_params_y, const int subpel_y_q4,
+    __m256i *const coeffs /* [4] */) {
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  const __m128i coeffs_y8 = _mm_loadu_si128((__m128i *)y_filter);
+  const __m256i coeffs_y = _mm256_broadcastsi128_si256(coeffs_y8);
+
+  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[0] = _mm256_shuffle_epi32(coeffs_y, 0x00);
+  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[1] = _mm256_shuffle_epi32(coeffs_y, 0x55);
+  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[2] = _mm256_shuffle_epi32(coeffs_y, 0xaa);
+  // coeffs 6 7 6 7 6 7 6 7
+  coeffs[3] = _mm256_shuffle_epi32(coeffs_y, 0xff);
+}
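
The shuffle_epi32 immediates 0x00/0x55/0xaa/0xff replicate one 32-bit lane, i.e. one 16-bit coefficient pair, across the register, which is exactly the operand layout _mm256_madd_epi16 consumes in the vertical pass. A hedged scalar model of a single madd lane:

#include <stdint.h>

// One 32-bit lane of _mm256_madd_epi16(s, coeffs): two adjacent 16-bit
// samples times the broadcast coefficient pair, summed into 32 bits.
static int32_t madd_lane(int16_t s0, int16_t s1, int16_t c0, int16_t c1) {
  return (int32_t)s0 * c0 + (int32_t)s1 * c1;
}
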
+static INLINE __m256i convolve(const __m256i *const s,
+                               const __m256i *const coeffs) {
+  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
+
+  // 16-bit partial sums in lane order 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
+                                       _mm256_add_epi16(res_23, res_67));
+
+  return res;
+}
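
Grouping the partial sums as (res_01 + res_45) + (res_23 + res_67), rather than in tap order, plausibly keeps the 16-bit intermediates balanced, since the largest taps sit in the middle of the kernel. A hedged scalar equivalent of one output lane (names hypothetical, coefficients already halved):

// Scalar model of convolve(): fold the four maddubs pair-products in the
// same order as the intrinsics above.
static int convolve_lane(const int s[8], const int half_coeff[8]) {
  const int res_01 = s[0] * half_coeff[0] + s[1] * half_coeff[1];
  const int res_23 = s[2] * half_coeff[2] + s[3] * half_coeff[3];
  const int res_45 = s[4] * half_coeff[4] + s[5] * half_coeff[5];
  const int res_67 = s[6] * half_coeff[6] + s[7] * half_coeff[7];
  return (res_01 + res_45) + (res_23 + res_67);
}
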
+static INLINE __m256i convolve_y_2d(const __m256i *const s,
+                                    const __m256i *const coeffs) {
+  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
+
+  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
+                                       _mm256_add_epi32(res_2, res_3));
+
+  return res;
+}
+static INLINE __m256i convolve_x(const __m256i data,
+                                 const __m256i *const coeffs,
+                                 const __m256i *const filt) {
+  __m256i s[4];
+
+  s[0] = _mm256_shuffle_epi8(data, filt[0]);
+  s[1] = _mm256_shuffle_epi8(data, filt[1]);
+  s[2] = _mm256_shuffle_epi8(data, filt[2]);
+  s[3] = _mm256_shuffle_epi8(data, filt[3]);
+
+  return convolve(s, coeffs);
+}
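
The filt masks (filt4_global_avx2 at the top of this hunk pairs bytes 6/7, 7/8, 8/9, ...) expand one loaded register into the duplicated byte-pair windows that _mm256_maddubs_epi16 consumes, one shuffle per coefficient pair. A hedged scalar sketch of the layout, assuming the filt1..filt3 masks follow the same sliding pattern:

#include <stdint.h>

// For output pixel x, mask k gathers the byte pair feeding taps 2k and 2k+1.
static void gather_pairs(const uint8_t *src, int x, uint8_t pairs[4][2]) {
  for (int k = 0; k < 4; ++k) {
    pairs[k][0] = src[x + 2 * k];      // multiplied by coefficient 2k
    pairs[k][1] = src[x + 2 * k + 1];  // multiplied by coefficient 2k + 1
  }
}
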
+static INLINE void add_store_aligned(CONV_BUF_TYPE *const dst,
+                                     const __m256i *const res,
+                                     const __m256i *const avg_mask) {
+  __m256i d;
+
+  d = _mm256_load_si256((__m256i *)dst);
+  d = _mm256_and_si256(d, *avg_mask);
+  d = _mm256_add_epi32(d, *res);
+  _mm256_store_si256((__m256i *)dst, d);
+}
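
With an all-zero avg_mask the masked load contributes nothing, so res simply overwrites dst; with an all-ones mask, res accumulates onto the stored first prediction. A hedged usage sketch, assuming the caller derives the mask from conv_params->do_average as elsewhere in the AVX2 convolve code:

const __m256i avg_mask = _mm256_set1_epi32(conv_params->do_average ? -1 : 0);
add_store_aligned(&dst[y * dst_stride + x], &res, &avg_mask);
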
#endif
@@ -401,7 +401,6 @@ void av1_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst0,
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
for (int k = 0; k < filter_params_x->taps; ++k) {
-        assert((x_filter[k] % 2) == 0);
sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
}
assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
@@ -419,7 +418,6 @@ void av1_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst0,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
-        assert((y_filter[k] % 2) == 0);
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
}
assert(0 <= sum && sum < (1 << (offset_bits + 2)));
@@ -456,7 +454,6 @@ void av1_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst0,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE res = 0;
for (int k = 0; k < filter_params_y->taps; ++k) {
-        assert((y_filter[k] % 2) == 0);
res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
}
res *= (1 << bits);
@@ -490,7 +487,6 @@ void av1_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst0,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE res = 0;
for (int k = 0; k < filter_params_x->taps; ++k) {
-        assert((x_filter[k] % 2) == 0);
res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
}
res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
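
For reference, ROUND_POWER_OF_TWO is libaom's round-half-up shift; since prepare_coeffs() halves the coefficients, the AVX2 paths apply the equivalent rounding with one bit less, as in the scalar sketch after prepare_coeffs() above. A quick worked check with hypothetical values:

#include <stdio.h>

// As defined in aom_dsp/aom_dsp_common.h.
#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))

int main(void) {
  printf("%d\n", ROUND_POWER_OF_TWO(100, 3));  // (100 + 4) >> 3 == 13
  return 0;
}
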
@@ -554,7 +550,6 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
for (int k = 0; k < filter_params_x->taps; ++k) {
-        assert((x_filter[k] % 2) == 0);
sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
}
assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
@@ -572,7 +567,6 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
-        assert((y_filter[k] % 2) == 0);
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
}
assert(0 <= sum && sum < (1 << (offset_bits + 2)));
@@ -602,7 +596,6 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE res = 0;
for (int k = 0; k < filter_params_y->taps; ++k) {
-        assert((y_filter[k] % 2) == 0);
res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
}
dst[y * dst_stride + x] =
@@ -630,7 +623,6 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE res = 0;
for (int k = 0; k < filter_params_x->taps; ++k) {
-        assert((x_filter[k] % 2) == 0);
res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
}
res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
@@ -740,7 +740,7 @@ void av1_make_masked_inter_predictor(
// CONFIG_HIGHBITDEPTH or just 8 otherwise.
#define INTER_PRED_BYTES_PER_PIXEL 4
-  DECLARE_ALIGNED(16, uint8_t,
+  DECLARE_ALIGNED(32, uint8_t,
tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
#undef INTER_PRED_BYTES_PER_PIXEL
@@ -988,7 +988,7 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
for (idx = 0; idx < b8_w; idx += b4_w) {
MB_MODE_INFO *this_mbmi = &xd->mi[row * xd->mi_stride + col]->mbmi;
is_compound = has_second_ref(this_mbmi);
-        DECLARE_ALIGNED(16, int32_t, tmp_dst[8 * 8]);
+        DECLARE_ALIGNED(32, int32_t, tmp_dst[8 * 8]);
int tmp_dst_stride = 8;
assert(w <= 8 && h <= 8);
ConvolveParams conv_params = get_conv_params_no_round(
@@ -1124,7 +1124,7 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
uint8_t *pre[2];
SubpelParams subpel_params[2];
-    DECLARE_ALIGNED(16, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
+    DECLARE_ALIGNED(32, int32_t, tmp_dst[MAX_SB_SIZE * MAX_SB_SIZE]);
for (ref = 0; ref < 1 + is_compound; ++ref) {
#if CONFIG_INTRABC
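
The bump from 16- to 32-byte alignment matches the aligned 256-bit accesses in add_store_aligned(): _mm256_load_si256 and _mm256_store_si256 fault on addresses that are not 32-byte aligned. A minimal sketch of the contract, using the GCC form of the macro for illustration:

#include <immintrin.h>
#include <stdint.h>

#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))

int main(void) {
  DECLARE_ALIGNED(32, int32_t, tmp_dst[8 * 8]);
  const __m256i zero = _mm256_setzero_si256();
  _mm256_store_si256((__m256i *)tmp_dst, zero);  // safe: 32-byte aligned
  return 0;
}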