Commit c8e0336a, authored by Deepa K G and committed by Yunqing Wang

AVX2 optimization of motion compensation functions

AVX2 implementations of av1_convolve_x_sr, av1_convolve_y_sr and
av1_convolve_2d_sr have been added.

Improvements have been made to av1_convolve_x_avx2, av1_convolve_y_avx2
and av1_convolve_2d_avx2.

Change-Id: I62a699dd9dcf42de94dd72cc2d43affc0dc31404
parent aa71f071
......@@ -13,29 +13,9 @@
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/x86/convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_ports/mem.h"
// filters for 16_h8 and 16_v8
//
// Four constant byte tables, each 32 entries long and declared with 32-byte
// alignment. Every table stores the same 16-byte pattern twice. The pattern
// is eight overlapping pairs of consecutive indices (k, k+1), (k+1, k+2), ...
// and the four tables differ only in the starting index k: 0, 2, 4 and 6.
//
// NOTE(review): presumably these are byte-shuffle control masks applied
// independently to each 128-bit half of a 256-bit register, pairing adjacent
// source bytes for filter taps 0-1, 2-3, 4-5 and 6-7 -- confirm against the
// code that loads them.

// Index pairs starting at byte 0: (0,1) (1,2) ... (7,8).
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};
// Index pairs starting at byte 2: (2,3) (3,4) ... (9,10).
DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};
// Index pairs starting at byte 4: (4,5) (5,6) ... (11,12).
DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};
// Index pairs starting at byte 6: (6,7) (7,8) ... (13,14).
DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) || \
(__clang_major__ == 3 && __clang_minor__ <= 3) || \
......
/*
* Copyright (c) 2018, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_
#define AOM_DSP_X86_CONVOLVE_AVX2_H_
// filters for 16
//
// Four constant byte tables, each 32 entries long and declared with 32-byte
// alignment. Every table stores the same 16-byte pattern twice. The pattern
// is eight overlapping pairs of consecutive indices (k, k+1), (k+1, k+2), ...
// and the four tables differ only in the starting index k: 0, 2, 4 and 6.
//
// The tables are `static const`, so every translation unit that includes this
// header gets its own private copy. This header includes nothing itself: the
// includer must already have DECLARE_ALIGNED and uint8_t in scope.
//
// NOTE(review): presumably these are byte-shuffle control masks applied
// independently to each 128-bit half of a 256-bit register, pairing adjacent
// source bytes for filter taps 0-1, 2-3, 4-5 and 6-7 -- confirm against the
// code that loads them.

// Index pairs starting at byte 0: (0,1) (1,2) ... (7,8).
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};
// Index pairs starting at byte 2: (2,3) (3,4) ... (9,10).
DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};
// Index pairs starting at byte 4: (4,5) (5,6) ... (11,12).
DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};
// Index pairs starting at byte 6: (6,7) (7,8) ... (13,14).
DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
#endif  // AOM_DSP_X86_CONVOLVE_AVX2_H_
......@@ -497,9 +497,9 @@ specialize qw/av1_convolve_x sse2 avx2/;
add_proto qw/void av1_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_y sse2 avx2/;
add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_x_sr c sse2/;
specialize qw/av1_convolve_x_sr c sse2 avx2/;
add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_y_sr c sse2/;
specialize qw/av1_convolve_y_sr c sse2 avx2/;
add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d_scale sse4_1/;
......
......@@ -401,6 +401,7 @@ void av1_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst0,
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
for (int k = 0; k < filter_params_x->taps; ++k) {
assert((x_filter[k] % 2) == 0);
sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
}
assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
......@@ -418,6 +419,7 @@ void av1_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst0,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
assert((y_filter[k] % 2) == 0);
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
}
assert(0 <= sum && sum < (1 << (offset_bits + 2)));
......@@ -454,6 +456,7 @@ void av1_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst0,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE res = 0;
for (int k = 0; k < filter_params_y->taps; ++k) {
assert((y_filter[k] % 2) == 0);
res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
}
res *= (1 << bits);
......@@ -487,6 +490,7 @@ void av1_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst0,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE res = 0;
for (int k = 0; k < filter_params_x->taps; ++k) {
assert((x_filter[k] % 2) == 0);
res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
}
res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
......@@ -550,6 +554,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
for (int x = 0; x < w; ++x) {
int32_t sum = (1 << (bd + FILTER_BITS - 1));
for (int k = 0; k < filter_params_x->taps; ++k) {
assert((x_filter[k] % 2) == 0);
sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
}
assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
......@@ -567,6 +572,7 @@ void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE sum = 1 << offset_bits;
for (int k = 0; k < filter_params_y->taps; ++k) {
assert((y_filter[k] % 2) == 0);
sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
}
assert(0 <= sum && sum < (1 << (offset_bits + 2)));
......@@ -596,6 +602,7 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE res = 0;
for (int k = 0; k < filter_params_y->taps; ++k) {
assert((y_filter[k] % 2) == 0);
res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
}
dst[y * dst_stride + x] =
......@@ -623,6 +630,7 @@ void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
for (int x = 0; x < w; ++x) {
CONV_BUF_TYPE res = 0;
for (int k = 0; k < filter_params_x->taps; ++k) {
assert((x_filter[k] % 2) == 0);
res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
}
res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
......
This diff is collapsed.
This diff is collapsed.
......@@ -94,6 +94,14 @@ INSTANTIATE_TEST_CASE_P(
libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_sse2, 1, 1, 0));
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(
AVX2_X, AV1Convolve2DSrTest,
libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_avx2, 1, 0, 0));
INSTANTIATE_TEST_CASE_P(
AVX2_Y, AV1Convolve2DSrTest,
libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_avx2, 0, 1, 0));
INSTANTIATE_TEST_CASE_P(
AVX2, AV1Convolve2DSrTest,
libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_avx2, 1, 1, 0));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment