Commit 5f0485b4 authored by Linfeng Zhang

Add av1_convolve_2d_copy_sr_sse2()

Change-Id: I7776ccaecb6933af47253a15fa8ed8a53346fac1
parent e07a675f
......@@ -526,7 +526,7 @@ specialize qw/av1_convolve_rounding avx2/;
# Prototype/specialization entries for the convolve functions. Each add_proto
# line gives a return type, function name and argument list; all functions in
# this group share the same argument list. The specialize line that follows
# names the implementation variants (e.g. sse2, avx2) for that function.
# NOTE(review): the exact semantics of add_proto/specialize are defined by the
# build scripts outside this hunk — confirm there.
add_proto qw/void av1_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d_copy sse2/;
add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
# NOTE(review): two specialize lines for av1_convolve_2d_copy_sr appear here.
# This text comes from a diff view, so the "c" line is presumably the line
# being removed and the "sse2" line its replacement — confirm against the
# actual file before relying on both being present.
specialize qw/av1_convolve_2d_copy_sr c/;
specialize qw/av1_convolve_2d_copy_sr sse2/;
add_proto qw/void av1_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_x sse2 avx2/;
add_proto qw/void av1_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
......
......@@ -320,6 +320,148 @@ void av1_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
}
}
#if CONFIG_EXT_PARTITION
static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
__m128i s[8];
s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
_mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
_mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
_mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
_mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
_mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
_mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
_mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
_mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
}
#endif
// Copies a w x h block of 8-bit pixels from src to dst without any filtering.
//
// filter_params_x, filter_params_y, subpel_x_q4, subpel_y_q4 and conv_params
// are part of the signature but are never read.
//
// Requirements that follow from the code below:
//  - h must be positive and even: every branch copies two rows per loop
//    iteration and only stops when h reaches exactly 0.
//  - For w >= 16 the destination is written with aligned 16-byte stores, so
//    dst must be 16-byte aligned and dst_stride a multiple of 16. The source
//    is read with unaligned loads in every branch.
//  - Handled widths are 2, 4, 8, 16, 32 and 64, plus 128 when
//    CONFIG_EXT_PARTITION is enabled. Without CONFIG_EXT_PARTITION any other
//    width copies nothing.
void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  InterpFilterParams *filter_params_x,
                                  InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;
  (void)conv_params;

  // The loops below consume two rows at a time and terminate on h == 0.
  assert(h > 0 && !(h % 2));
  if (w >= 16) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 16));
  }

  if (w == 2) {
    do {
      int row;
      for (row = 0; row < 2; ++row) {
        // Byte-wise copy: src and dst carry no 2-byte alignment guarantee, so
        // accessing them through a uint16_t pointer would be undefined
        // behaviour. The row is read in full before it is written.
        const uint8_t p0 = src[0];
        const uint8_t p1 = src[1];
        dst[0] = p0;
        dst[1] = p1;
        src += src_stride;
        dst += dst_stride;
      }
      h -= 2;
    } while (h);
  } else if (w == 4) {
    do {
      int row;
      for (row = 0; row < 2; ++row) {
        // Same reasoning as the w == 2 case, for a 4-byte row.
        const uint8_t p0 = src[0];
        const uint8_t p1 = src[1];
        const uint8_t p2 = src[2];
        const uint8_t p3 = src[3];
        dst[0] = p0;
        dst[1] = p1;
        dst[2] = p2;
        dst[3] = p3;
        src += src_stride;
        dst += dst_stride;
      }
      h -= 2;
    } while (h);
  } else if (w == 8) {
    do {
      // 64-bit loads/stores; neither has an alignment requirement.
      __m128i s[2];
      s[0] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16) {
    do {
      // Unaligned loads, aligned stores (see the asserts above).
      __m128i s[2];
      s[0] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      _mm_store_si128((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 32) {
    do {
      // s[0..1] hold the first row, s[2..3] the second.
      __m128i s[4];
      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
      src += src_stride;
      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 64) {
    do {
      // s[0..3] hold the first row, s[4..7] the second.
      __m128i s[8];
      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
      src += src_stride;
      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
      _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
      _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
      _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
      _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
      dst += dst_stride;
      h -= 2;
    } while (h);
#if CONFIG_EXT_PARTITION
  } else {
    // This fallback always copies 128 bytes per row, so any other width
    // reaching it would write past the intended block.
    assert(w == 128);
    do {
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
#endif  // CONFIG_EXT_PARTITION
  }
}
#if CONFIG_JNT_COMP
void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
uint8_t *dst0, int dst_stride0, int w, int h,
......
......@@ -16,6 +16,7 @@ using std::tr1::tuple;
using std::tr1::make_tuple;
using libaom_test::ACMRandom;
using libaom_test::AV1Convolve2D::AV1Convolve2DTest;
using libaom_test::AV1Convolve2D::AV1Convolve2DSrTest;
#if CONFIG_JNT_COMP
using libaom_test::AV1Convolve2D::AV1JntConvolve2DTest;
#endif
......@@ -72,6 +73,14 @@ INSTANTIATE_TEST_CASE_P(
libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_avx2, 1, 1, 1));
#endif
// Parameterised test bodies for the AV1Convolve2DSrTest fixture. Each one
// passes GET_PARAM(0) to the corresponding fixture method (presumably the
// function under test taken from the test parameter — confirm against the
// GET_PARAM definition). The DISABLED_ prefix is the googletest convention
// for tests that are skipped unless explicitly requested.
TEST_P(AV1Convolve2DSrTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
TEST_P(AV1Convolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
// Instantiates the fixture for av1_convolve_2d_copy_sr_sse2 under the prefix
// SSE2_COPY.
// NOTE(review): the meaning of BuildParams' trailing (0, 0, 1) arguments is
// not visible here — check its declaration.
// NOTE(review): no HAVE_SSE2 guard is visible around this instantiation in
// this hunk, unlike the HAVE_SSE4_1-guarded block that follows; confirm this
// file is only compiled when the SSE2 function is available.
INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1Convolve2DSrTest,
libaom_test::AV1Convolve2D::BuildParams(
av1_convolve_2d_copy_sr_sse2, 0, 0, 1));
#if CONFIG_JNT_COMP && HAVE_SSE4_1
TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
......
This diff is collapsed.
......@@ -50,6 +50,20 @@ class AV1Convolve2DTest : public ::testing::TestWithParam<Convolve2DParam> {
libaom_test::ACMRandom rnd_;
};
// googletest fixture parameterised on Convolve2DParam. Only declarations are
// given here; the member definitions live outside this view.
class AV1Convolve2DSrTest : public ::testing::TestWithParam<Convolve2DParam> {
public:
virtual ~AV1Convolve2DSrTest();
// googletest per-test hooks.
virtual void SetUp();
virtual void TearDown();
protected:
// Both helpers receive the implementation under test as a convolve_2d_func.
// Judging by the names, one checks output and the other measures speed —
// confirm in the definitions.
void RunCheckOutput(convolve_2d_func test_impl);
void RunSpeedTest(convolve_2d_func test_impl);
// Random source for the fixture (presumably used to fill test buffers —
// confirm in the definitions).
libaom_test::ACMRandom rnd_;
};
#if CONFIG_JNT_COMP
class AV1JntConvolve2DTest : public ::testing::TestWithParam<Convolve2DParam> {
public:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment