Xiph.Org / aom-rav1e
Commit 29a17db9 authored Jul 01, 2015 by Parag Salasakar, committed by Gerrit Code Review on Jul 01, 2015
Merge "mips msa vpx_dsp sad sad4d avgsad optimization"
parents 440995ca bc3ec8ef
Showing 5 changed files with 1008 additions and 39 deletions (+1008, -39)
test/sad_test.cc               +94    -0
vpx_dsp/mips/macros_msa.h      +87    -0
vpx_dsp/mips/sad_msa.c         +787   -0
vpx_dsp/vpx_dsp.mk             +1     -0
vpx_dsp/vpx_dsp_rtcd_defs.pl   +39   -39
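For orientation, every kernel this change adds computes a block sum of absolute differences (SAD): the per-pixel absolute difference between a source block and a reference block, summed over the block. A minimal scalar sketch of that computation, assuming 8-bit pixels and byte strides (the name sad_ref_c is illustrative and not part of this commit):

#include <stdint.h>

static uint32_t sad_ref_c(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          int width, int height) {
  uint32_t sad = 0;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      /* absolute difference of one pixel pair, accumulated */
      sad += (src[x] > ref[x]) ? (src[x] - ref[x]) : (ref[x] - src[x]);
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

The "avgsad" variants do the same after averaging the reference with a second predictor (see the AVER_UB macros below), and the "sad4d" variants evaluate four reference blocks against one source block in a single pass.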
test/sad_test.cc  View file @ 29a17db9
...
@@ -1114,4 +1114,98 @@ const SadMxNx4Param x4d_avx2_tests[] = {
INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
#endif // HAVE_AVX2
//------------------------------------------------------------------------------
// MIPS functions
#if HAVE_MSA
const SadMxNFunc sad64x64_msa = vpx_sad64x64_msa;
const SadMxNFunc sad64x32_msa = vpx_sad64x32_msa;
const SadMxNFunc sad32x64_msa = vpx_sad32x64_msa;
const SadMxNFunc sad32x32_msa = vpx_sad32x32_msa;
const SadMxNFunc sad32x16_msa = vpx_sad32x16_msa;
const SadMxNFunc sad16x32_msa = vpx_sad16x32_msa;
const SadMxNFunc sad16x16_msa = vpx_sad16x16_msa;
const SadMxNFunc sad16x8_msa = vpx_sad16x8_msa;
const SadMxNFunc sad8x16_msa = vpx_sad8x16_msa;
const SadMxNFunc sad8x8_msa = vpx_sad8x8_msa;
const SadMxNFunc sad8x4_msa = vpx_sad8x4_msa;
const SadMxNFunc sad4x8_msa = vpx_sad4x8_msa;
const SadMxNFunc sad4x4_msa = vpx_sad4x4_msa;
const SadMxNParam msa_tests[] = {
  make_tuple(64, 64, sad64x64_msa, -1),
  make_tuple(64, 32, sad64x32_msa, -1),
  make_tuple(32, 64, sad32x64_msa, -1),
  make_tuple(32, 32, sad32x32_msa, -1),
  make_tuple(32, 16, sad32x16_msa, -1),
  make_tuple(16, 32, sad16x32_msa, -1),
  make_tuple(16, 16, sad16x16_msa, -1),
  make_tuple(16, 8, sad16x8_msa, -1),
  make_tuple(8, 16, sad8x16_msa, -1),
  make_tuple(8, 8, sad8x8_msa, -1),
  make_tuple(8, 4, sad8x4_msa, -1),
  make_tuple(4, 8, sad4x8_msa, -1),
  make_tuple(4, 4, sad4x4_msa, -1),
};
INSTANTIATE_TEST_CASE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests));
const SadMxNAvgFunc sad64x64_avg_msa = vpx_sad64x64_avg_msa;
const SadMxNAvgFunc sad64x32_avg_msa = vpx_sad64x32_avg_msa;
const SadMxNAvgFunc sad32x64_avg_msa = vpx_sad32x64_avg_msa;
const SadMxNAvgFunc sad32x32_avg_msa = vpx_sad32x32_avg_msa;
const SadMxNAvgFunc sad32x16_avg_msa = vpx_sad32x16_avg_msa;
const SadMxNAvgFunc sad16x32_avg_msa = vpx_sad16x32_avg_msa;
const SadMxNAvgFunc sad16x16_avg_msa = vpx_sad16x16_avg_msa;
const SadMxNAvgFunc sad16x8_avg_msa = vpx_sad16x8_avg_msa;
const SadMxNAvgFunc sad8x16_avg_msa = vpx_sad8x16_avg_msa;
const SadMxNAvgFunc sad8x8_avg_msa = vpx_sad8x8_avg_msa;
const SadMxNAvgFunc sad8x4_avg_msa = vpx_sad8x4_avg_msa;
const SadMxNAvgFunc sad4x8_avg_msa = vpx_sad4x8_avg_msa;
const SadMxNAvgFunc sad4x4_avg_msa = vpx_sad4x4_avg_msa;
const SadMxNAvgParam avg_msa_tests[] = {
  make_tuple(64, 64, sad64x64_avg_msa, -1),
  make_tuple(64, 32, sad64x32_avg_msa, -1),
  make_tuple(32, 64, sad32x64_avg_msa, -1),
  make_tuple(32, 32, sad32x32_avg_msa, -1),
  make_tuple(32, 16, sad32x16_avg_msa, -1),
  make_tuple(16, 32, sad16x32_avg_msa, -1),
  make_tuple(16, 16, sad16x16_avg_msa, -1),
  make_tuple(16, 8, sad16x8_avg_msa, -1),
  make_tuple(8, 16, sad8x16_avg_msa, -1),
  make_tuple(8, 8, sad8x8_avg_msa, -1),
  make_tuple(8, 4, sad8x4_avg_msa, -1),
  make_tuple(4, 8, sad4x8_avg_msa, -1),
  make_tuple(4, 4, sad4x4_avg_msa, -1),
};
INSTANTIATE_TEST_CASE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
const SadMxNx4Func sad64x64x4d_msa = vpx_sad64x64x4d_msa;
const SadMxNx4Func sad64x32x4d_msa = vpx_sad64x32x4d_msa;
const SadMxNx4Func sad32x64x4d_msa = vpx_sad32x64x4d_msa;
const SadMxNx4Func sad32x32x4d_msa = vpx_sad32x32x4d_msa;
const SadMxNx4Func sad32x16x4d_msa = vpx_sad32x16x4d_msa;
const SadMxNx4Func sad16x32x4d_msa = vpx_sad16x32x4d_msa;
const SadMxNx4Func sad16x16x4d_msa = vpx_sad16x16x4d_msa;
const SadMxNx4Func sad16x8x4d_msa = vpx_sad16x8x4d_msa;
const SadMxNx4Func sad8x16x4d_msa = vpx_sad8x16x4d_msa;
const SadMxNx4Func sad8x8x4d_msa = vpx_sad8x8x4d_msa;
const SadMxNx4Func sad8x4x4d_msa = vpx_sad8x4x4d_msa;
const SadMxNx4Func sad4x8x4d_msa = vpx_sad4x8x4d_msa;
const SadMxNx4Func sad4x4x4d_msa = vpx_sad4x4x4d_msa;
const SadMxNx4Param x4d_msa_tests[] = {
  make_tuple(64, 64, sad64x64x4d_msa, -1),
  make_tuple(64, 32, sad64x32x4d_msa, -1),
  make_tuple(32, 64, sad32x64x4d_msa, -1),
  make_tuple(32, 32, sad32x32x4d_msa, -1),
  make_tuple(32, 16, sad32x16x4d_msa, -1),
  make_tuple(16, 32, sad16x32x4d_msa, -1),
  make_tuple(16, 16, sad16x16x4d_msa, -1),
  make_tuple(16, 8, sad16x8x4d_msa, -1),
  make_tuple(8, 16, sad8x16x4d_msa, -1),
  make_tuple(8, 8, sad8x8x4d_msa, -1),
  make_tuple(8, 4, sad8x4x4d_msa, -1),
  make_tuple(4, 8, sad4x8x4d_msa, -1),
  make_tuple(4, 4, sad4x4x4d_msa, -1),
};
INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
#endif // HAVE_MSA
}  // namespace
vpx_dsp/mips/macros_msa.h  View file @ 29a17db9
...
@@ -82,12 +82,24 @@
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) { \
LD_B2(RTYPE, (psrc), stride, out0, out1); \
out2 = LD_B(RTYPE, (psrc) + 2 * stride); \
}
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
LD_B2(RTYPE, (psrc), stride, out0, out1); \
LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
/* Description : Load vectors with 8 halfword elements with stride
Arguments : Inputs - psrc, stride
Outputs - out0, out1
...
@@ -105,6 +117,40 @@
}
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
/* Description : average with rounding (in0 + in1 + 1) / 2.
Arguments : Inputs - in0, in1, in2, in3,
Outputs - out0, out1
Return Type - as per RTYPE
Details : Each unsigned byte element from 'in0' vector is added with
each unsigned byte element from 'in1' vector. Then the average
with rounding is calculated and written to 'out0'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \
out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) { \
AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
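/* Editor's sketch, not part of this change: per byte lane, AVER_UB computes
 * the rounded average of two unsigned bytes, i.e. (in0 + in1 + 1) >> 1. */
static inline uint8_t aver_u8_sketch(uint8_t in0, uint8_t in1) {
  return (uint8_t)(((uint32_t)in0 + in1 + 1) >> 1);
}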
/* Description : Immediate number of elements to slide
Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
Outputs - out0, out1
Return Type - as per RTYPE
Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by
value specified in the 'slide_val'
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \
out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
/* Description : Dot product & addition of halfword vector elements
Arguments : Inputs - mult0, mult1, cnst0, cnst1
Outputs - out0, out1
...
@@ -155,6 +201,26 @@
sum_m; \
})
/* Description : Horizontal addition of 8 unsigned halfword elements
Arguments : Inputs - in (unsigned halfword vector)
Outputs - sum_m (u32 sum)
Return Type - unsigned word
Details : 8 unsigned halfword elements of input vector are added
together and the resulting integer sum is returned
*/
#define HADD_UH_U32(in) ({ \
v4u32 res_m; \
v2u64 res0_m, res1_m; \
uint32_t sum_m; \
\
res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
res0_m = __msa_hadd_u_d(res_m, res_m); \
res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
res0_m = res0_m + res1_m; \
sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
sum_m; \
})
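/* Editor's sketch, not part of this change: HADD_UH_U32 reduces one v8u16
 * vector to a scalar; the plain-C equivalent is simply a sum of the 8 lanes. */
static inline uint32_t hadd_uh_u32_sketch(const uint16_t lanes[8]) {
  uint32_t sum = 0;
  int i;
  for (i = 0; i < 8; ++i) sum += lanes[i];
  return sum;
}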
/* Description : Horizontal subtraction of unsigned byte vector elements
Arguments : Inputs - in0, in1
Outputs - out0, out1
...
@@ -169,6 +235,27 @@
}
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
/* Description : SAD (Sum of Absolute Difference)
Arguments : Inputs - in0, in1, ref0, ref1
Outputs - sad_m (halfword vector)
Return Type - unsigned halfword
Details : Absolute difference of all the byte elements from 'in0' with
'ref0' is calculated and preserved in 'diff0'. Then even-odd
pairs are added together to generate 8 halfword results.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1) ({ \
v16u8 diff0_m, diff1_m; \
v8u16 sad_m = { 0 }; \
\
diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \
diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \
\
sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
\
sad_m; \
})
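/* Editor's sketch, not part of this change: SAD_UB2_UH in plain C. Each u16
 * output lane holds the absolute differences of the adjacent byte pair it
 * covers, accumulated across both 16-byte rows; callers then add the result
 * into a running v8u16 total. */
static inline void sad_ub2_uh_sketch(const uint8_t in0[16], const uint8_t in1[16],
                                     const uint8_t ref0[16], const uint8_t ref1[16],
                                     uint16_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i) out[i] = 0;
  for (i = 0; i < 16; ++i) {
    const int d0 = (in0[i] > ref0[i]) ? in0[i] - ref0[i] : ref0[i] - in0[i];
    const int d1 = (in1[i] > ref1[i]) ? in1[i] - ref1[i] : ref1[i] - in1[i];
    out[i / 2] = (uint16_t)(out[i / 2] + d0 + d1);
  }
}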
/* Description : Set element n input vector to GPR value
Arguments : Inputs - in0, in1, in2, in3
Output - out
...
vpx_dsp/mips/sad_msa.c  new file (0 → 100644)  View file @ 29a17db9
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}
static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}
static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}
static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
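/* Editor's sketch, not part of this change: the *_x4d functions above and
 * below compute the SAD of one source block against four reference blocks in
 * a single pass, writing one total per reference. Scalar equivalent: */
static void sad_x4d_ref_sketch(const uint8_t *src, int src_stride,
                               const uint8_t *const ref[4], int ref_stride,
                               int width, int height, uint32_t sad_array[4]) {
  int i, x, y;
  for (i = 0; i < 4; ++i) {
    const uint8_t *s = src;
    const uint8_t *r = ref[i];
    uint32_t sad = 0;
    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; ++x) {
        sad += (s[x] > r[x]) ? (s[x] - r[x]) : (r[x] - s[x]);
      }
      s += src_stride;
      r += ref_stride;
    }
    sad_array[i] = sad;
  }
}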
static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };