Commit 1d2d1e75 authored by Geza Lore, committed by Gerrit Code Review

Merge "Add SSE2 versions of 128x128 vpx_sad*" into nextgenv2

parents edf6a708 a0e1c232
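
This change adds SSE2 SAD kernels for the 128x128, 128x64 and 64x128 block sizes used by CONFIG_EXT_PARTITION, together with the matching test entries and RTCD specializations. For orientation, a minimal scalar sketch of the sum of absolute differences these kernels compute; this is not the library's C reference code, and the helper name sad_wxh_c is made up for illustration:

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of the SAD computed by the vpx_sadWxH kernels: the sum of
 * |src - ref| over a WxH block, walking both buffers by their strides. */
static unsigned int sad_wxh_c(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              int w, int h) {
  unsigned int sad = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) sad += abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}
/* e.g. vpx_sad128x128(src, ss, ref, rs) corresponds to
 * sad_wxh_c(src, ss, ref, rs, 128, 128). */
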
@@ -763,6 +763,11 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
#if HAVE_SSE2
#if CONFIG_USE_X86INC
const SadMxNParam sse2_tests[] = {
#if CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(128, 128, &vpx_sad128x128_sse2, -1),
make_tuple(128, 64, &vpx_sad128x64_sse2, -1),
make_tuple(64, 128, &vpx_sad64x128_sse2, -1),
#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &vpx_sad64x64_sse2, -1),
make_tuple(64, 32, &vpx_sad64x32_sse2, -1),
make_tuple(32, 64, &vpx_sad32x64_sse2, -1),
@@ -815,6 +820,11 @@ const SadMxNParam sse2_tests[] = {
INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
const SadMxNAvgParam avg_sse2_tests[] = {
#if CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(128, 128, &vpx_sad128x128_avg_sse2, -1),
make_tuple(128, 64, &vpx_sad128x64_avg_sse2, -1),
make_tuple(64, 128, &vpx_sad64x128_avg_sse2, -1),
#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &vpx_sad64x64_avg_sse2, -1),
make_tuple(64, 32, &vpx_sad64x32_avg_sse2, -1),
make_tuple(32, 64, &vpx_sad32x64_avg_sse2, -1),
@@ -867,6 +877,11 @@ const SadMxNAvgParam avg_sse2_tests[] = {
INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
const SadMxNx4Param x4d_sse2_tests[] = {
#if CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(128, 128, &vpx_sad128x128x4d_sse2, -1),
make_tuple(128, 64, &vpx_sad128x64x4d_sse2, -1),
make_tuple(64, 128, &vpx_sad64x128x4d_sse2, -1),
#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &vpx_sad64x64x4d_sse2, -1),
make_tuple(64, 32, &vpx_sad64x32x4d_sse2, -1),
make_tuple(32, 64, &vpx_sad32x64x4d_sse2, -1),
......
@@ -1029,33 +1029,39 @@ foreach (@block_sizes) {
add_proto qw/unsigned int/, "vpx_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
}
specialize qw/vpx_sad64x64 avx2 neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad64x32 avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x64 avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x32 avx2 neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x16 avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x32 msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x16 mmx media neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x8 mmx neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x16 mmx neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x8 mmx neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x4 mmx neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad64x32_avg avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x64_avg avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x32_avg avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x16_avg avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x32_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x16_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x8_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x16_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x8_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x8_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x4_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad128x128 /, "$sse2_x86inc";
specialize qw/vpx_sad128x64 /, "$sse2_x86inc";
specialize qw/vpx_sad64x128 /, "$sse2_x86inc";
specialize qw/vpx_sad64x64 avx2 neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad64x32 avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x64 avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x32 avx2 neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x16 avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x32 msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x16 mmx media neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x8 mmx neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x16 mmx neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x8 mmx neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x4 mmx neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad128x128_avg /, "$sse2_x86inc";
specialize qw/vpx_sad128x64_avg /, "$sse2_x86inc";
specialize qw/vpx_sad64x128_avg /, "$sse2_x86inc";
specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad64x32_avg avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x64_avg avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x32_avg avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x16_avg avx2 msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x32_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x16_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x8_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x16_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x8_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x8_avg msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x4_avg msa/, "$sse2_x86inc";
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
foreach (@block_sizes) {
@@ -1149,19 +1155,23 @@ foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/void/, "vpx_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
}
specialize qw/vpx_sad64x64x4d avx2 neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad64x32x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x64x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x32x4d avx2 neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x16x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x32x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x16x4d neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x8x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x16x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x8x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x4x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad128x128x4d /, "$sse2_x86inc";
specialize qw/vpx_sad128x64x4d /, "$sse2_x86inc";
specialize qw/vpx_sad64x128x4d /, "$sse2_x86inc";
specialize qw/vpx_sad64x64x4d avx2 neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad64x32x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x64x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x32x4d avx2 neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad32x16x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x32x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x16x4d neon msa/, "$sse2_x86inc";
specialize qw/vpx_sad16x8x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x16x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x8x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad8x4x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc";
specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc";
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
......
@@ -175,6 +175,12 @@ SECTION .text
PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro
; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_128x2x4 5-6 0
PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64
PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6
%endmacro
; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride,
; uint8_t *ref[4], int ref_stride,
; uint32_t res[4]);
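
The signature above takes one source block and four candidate reference pointers and returns all four SADs in a single call, which is what the SADNXN4D instantiations below provide for the new block sizes. A scalar sketch of that contract (reusing the <stdint.h>/<stdlib.h> headers from the earlier sketch; the function name is illustrative, not the library's):

static void sad_128x128x4d_c(const uint8_t *src, int src_stride,
                             const uint8_t *const ref[4], int ref_stride,
                             uint32_t sad_array[4]) {
  /* One SAD per candidate reference; the SSE2 version keeps four
   * accumulators and reuses each source load for all four references. */
  for (int i = 0; i < 4; ++i) {
    uint32_t sad = 0;
    for (int y = 0; y < 128; ++y) {
      for (int x = 0; x < 128; ++x)
        sad += abs(src[y * src_stride + x] - ref[i][y * ref_stride + x]);
    }
    sad_array[i] = sad;
  }
}
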
@@ -224,6 +230,11 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
%endmacro
INIT_XMM sse2
%if CONFIG_EXT_PARTITION
SADNXN4D 128, 128
SADNXN4D 128, 64
SADNXN4D 64, 128
%endif
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
......
@@ -44,6 +44,76 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
%endif ; %3 == 7
%endmacro
%if CONFIG_EXT_PARTITION
; unsigned int vpx_sad128x128_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD128XN 1-2 0
SAD_FN 128, %1, 5, %2
mov n_rowsd, %1
pxor m0, m0
.loop:
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+32]
movu m4, [refq+48]
%if %2 == 1
pavgb m1, [second_predq+mmsize*0]
pavgb m2, [second_predq+mmsize*1]
pavgb m3, [second_predq+mmsize*2]
pavgb m4, [second_predq+mmsize*3]
%endif
psadbw m1, [srcq]
psadbw m2, [srcq+16]
psadbw m3, [srcq+32]
psadbw m4, [srcq+48]
paddd m1, m2
paddd m3, m4
paddd m0, m1
paddd m0, m3
movu m1, [refq+64]
movu m2, [refq+80]
movu m3, [refq+96]
movu m4, [refq+112]
%if %2 == 1
pavgb m1, [second_predq+mmsize*4]
pavgb m2, [second_predq+mmsize*5]
pavgb m3, [second_predq+mmsize*6]
pavgb m4, [second_predq+mmsize*7]
lea second_predq, [second_predq+mmsize*8]
%endif
psadbw m1, [srcq+64]
psadbw m2, [srcq+80]
psadbw m3, [srcq+96]
psadbw m4, [srcq+112]
add refq, ref_strideq
add srcq, src_strideq
paddd m1, m2
paddd m3, m4
paddd m0, m1
paddd m0, m3
sub n_rowsd, 1
jg .loop
movhlps m1, m0
paddd m0, m1
movd eax, m0
RET
%endmacro
INIT_XMM sse2
SAD128XN 128 ; sad128x128_sse2
SAD128XN 128, 1 ; sad128x128_avg_sse2
SAD128XN 64 ; sad128x64_sse2
SAD128XN 64, 1 ; sad128x64_avg_sse2
%endif
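
When SAD128XN is instantiated with a second argument of 1 (the _avg variants above), each reference load is first averaged with second_pred via pavgb, which rounds the average up, before the SAD against the source is accumulated. A scalar sketch of that behaviour, assuming the contiguous width-wide second_pred layout the assembly steps through (helper name and headers as in the earlier sketches, for illustration only):

static unsigned int sad_avg_wxh_c(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  const uint8_t *second_pred,
                                  int w, int h) {
  unsigned int sad = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      /* pavgb-style rounding average of the two predictors. */
      const int avg = (ref[x] + second_pred[x] + 1) >> 1;
      sad += abs(src[x] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += w;  /* second_pred is a contiguous w-wide block */
  }
  return sad;
}
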
; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
@@ -82,6 +152,10 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
%endmacro
INIT_XMM sse2
%if CONFIG_EXT_PARTITION
SAD64XN 128 ; sad64x128_sse2
SAD64XN 128, 1 ; sad64x128_avg_sse2
%endif
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
......