Commit 4b5e462d authored by Linfeng Zhang's avatar Linfeng Zhang

Upgrade vpx_lpf_{vertical,horizontal}_4 mmx to sse2

Followed the code style of other lpf functions.
These 2 functions put 2 rows of data in a single xmm register,
so they have similar but not identical filter operations,
and cannot share the same macros.

Change-Id: I3bab55a5d1a1232926ac8fd1f03251acc38302bc
parent 2240d83d
......@@ -430,16 +430,6 @@ TEST_P(Loop8Test9Param, ValueCheck) {
using std::tr1::make_tuple;
#if HAVE_MMX && CONFIG_USE_X86INC && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
MMX, Loop8Test6Param,
::testing::Values(
make_tuple(&vpx_lpf_horizontal_4_mmx,
&vpx_lpf_horizontal_4_c, 8),
make_tuple(&vpx_lpf_vertical_4_mmx,
&vpx_lpf_vertical_4_c, 8)));
#endif // HAVE_MMX
#if HAVE_SSE2
#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
......@@ -497,12 +487,16 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
SSE2, Loop8Test6Param,
::testing::Values(
make_tuple(&vpx_lpf_horizontal_4_sse2,
&vpx_lpf_horizontal_4_c, 8),
make_tuple(&vpx_lpf_horizontal_8_sse2,
&vpx_lpf_horizontal_8_c, 8),
make_tuple(&vpx_lpf_horizontal_edge_8_sse2,
&vpx_lpf_horizontal_edge_8_c, 8),
make_tuple(&vpx_lpf_horizontal_edge_16_sse2,
&vpx_lpf_horizontal_edge_16_c, 8),
make_tuple(&vpx_lpf_vertical_4_sse2,
&vpx_lpf_vertical_4_c, 8),
make_tuple(&vpx_lpf_vertical_8_sse2,
&vpx_lpf_vertical_8_c, 8),
make_tuple(&vpx_lpf_vertical_16_sse2,
......
......@@ -39,7 +39,6 @@ endif
DSP_SRCS-yes += intrapred.c
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_MMX) += x86/loopfilter_mmx.asm
DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
......
......@@ -543,7 +543,7 @@ specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_vertical_4 neon dspr2 msa/, "$mmx_x86inc";
specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
......@@ -564,7 +564,7 @@ specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/vpx_lpf_horizontal_4 neon dspr2 msa/, "$mmx_x86inc";
specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;
add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
......
This diff is collapsed.
......@@ -18,6 +18,213 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}
// filter_mask and hev_mask
// Builds the loop-filter on/off mask and the high-edge-variance (hev) mask
// for two rows of 8 pixels packed into single xmm registers (one row per
// 64-bit half).
//
// Expects in scope: q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0
//   (packed pixel pairs), thresh (zero-extended to 16 bits), limit
//   (blimit in the low 64 bits, limit in the high 64 bits), zero.
// Writes: mask -- filter mask, replicated into both 64-bit halves;
//         hev  -- high-edge-variance mask consumed by FILTER4.
#define FILTER_HEV_MASK do { \
/* (abs(q1 - q0), abs(p1 - p0) */ \
__m128i flat = abs_diff(q1p1, q0p0); \
/* abs(p1 - q1), abs(p0 - q0) */ \
const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
__m128i abs_p0q0, abs_p1q1, work; \
\
/* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
hev = _mm_cmpgt_epi16(hev, thresh); \
hev = _mm_packs_epi16(hev, hev); \
\
/* const int8_t mask = filter_mask(*limit, *blimit, */ \
/* p3, p2, p1, p0, q0, q1, q2, q3); */ \
abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\
abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
/* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
/* abs(p3 - p2), abs(p2 - p1) */ \
work = abs_diff(p3p2, p2p1); \
flat = _mm_max_epu8(work, flat); \
/* abs(q3 - q2), abs(q2 - q1) */ \
work = abs_diff(q3q2, q2q1); \
flat = _mm_max_epu8(work, flat); \
flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
mask = _mm_unpacklo_epi64(mask, flat); \
mask = _mm_subs_epu8(mask, limit); \
mask = _mm_cmpeq_epi8(mask, zero); \
mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
} while (0)
// Applies the filter4 step to the two packed pixel pairs p1p0 and q1q0.
//
// Expects in scope: p1p0, q1q0 (input pixels), mask and hev (as produced
//   by FILTER_HEV_MASK), ff (all-ones constant).
// Writes: ps1ps0, qs1qs0 -- the filtered p1|p0 and q1|q0 output pixels.
// Note: hev is clobbered (reused as a temporary holding filter2|filter).
#define FILTER4 do { \
const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, \
4, 4, 4, 4, 4, 4, 4, 4); \
const __m128i t80 = _mm_set1_epi8(0x80); \
__m128i filter, filter2filter1, work; \
\
ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \
qs1qs0 = _mm_xor_si128(q1q0, t80); \
\
/* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \
work = _mm_subs_epi8(ps1ps0, qs1qs0); \
filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \
/* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \
filter = _mm_subs_epi8(filter, work); \
filter = _mm_subs_epi8(filter, work); \
filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \
filter = _mm_and_si128(filter, mask); /* & mask */ \
filter = _mm_unpacklo_epi64(filter, filter); \
\
/* filter1 = signed_char_clamp(filter + 4) >> 3; */ \
/* filter2 = signed_char_clamp(filter + 3) >> 3; */ \
filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \
filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \
filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \
filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \
filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \
filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \
\
/* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \
filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \
filter = _mm_unpacklo_epi8(filter, filter); \
filter = _mm_srai_epi16(filter, 9); /* round */ \
filter = _mm_packs_epi16(filter, filter); \
filter = _mm_andnot_si128(hev, filter); \
\
hev = _mm_unpackhi_epi64(filter2filter1, filter); \
filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \
\
/* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \
/* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \
qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \
ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \
} while (0)
// Applies the 4-tap loop filter across a horizontal edge for 8 columns,
// processing two rows per xmm register (one row per 64-bit half).
//   s: pointer to the first pixel of the row just below the edge (q0 row)
//   p: pitch (stride) in bytes
//   _blimit, _limit, _thresh: filter thresholds, each an 8-byte vector
// Fix: removed a redundant 64-bit load of (s - 3 * p) into p3p2 that was
// immediately overwritten by the paired load below (dead store).
void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                               const uint8_t *_blimit, const uint8_t *_limit,
                               const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  // blimit in the low 64 bits, limit in the high 64 bits.
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);  // all ones
  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  // Load two rows per register; the row named first sits in the low half.
  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
  // Rearrange into the pair layout expected by the filter macros.
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);

  FILTER_HEV_MASK;
  FILTER4;

  // ps1ps0 holds p0 in the low half and p1 in the high half;
  // qs1qs0 holds q0 in the low half and q1 in the high half.
  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
}
// Applies the 4-tap loop filter across a vertical edge for 8 rows:
// loads the 8x8 block straddling the edge, transposes it so the filter
// operates on registers of rows, filters, then transposes the 4 changed
// columns back and stores them 4 bytes per row.
//   s: pointer to the first pixel right of the edge on the top row
//   p: pitch (stride) in bytes
//   _blimit, _limit, _thresh: filter thresholds, each an 8-byte vector
void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
const uint8_t *_blimit, const uint8_t *_limit,
const uint8_t *_thresh) {
const __m128i zero = _mm_set1_epi16(0);
// blimit in the low 64 bits, limit in the high 64 bits.
const __m128i limit =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
_mm_loadl_epi64((const __m128i *)_limit));
const __m128i thresh =
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
const __m128i ff = _mm_cmpeq_epi8(zero, zero);  // all ones
__m128i x0, x1, x2, x3;
__m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
__m128i mask, hev;
// Interleave pairs of rows byte-wise (row digit = first index below).
// 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
_mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
// 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
_mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
// 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
_mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
// 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
_mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
// Transpose 8x8
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
p1p0 = _mm_unpacklo_epi16(q1q0, x1);
// 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
x0 = _mm_unpacklo_epi16(x2, x3);
// 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
p3p2 = _mm_unpacklo_epi32(p1p0, x0);
// 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
p1p0 = _mm_unpackhi_epi32(p1p0, x0);
p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high
p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high
// 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
q1q0 = _mm_unpackhi_epi16(q1q0, x1);
// 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
x2 = _mm_unpackhi_epi16(x2, x3);
// 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
q3q2 = _mm_unpackhi_epi32(q1q0, x2);
// 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
q1q0 = _mm_unpacklo_epi32(q1q0, x2);
// Rearrange into the pair layout expected by the filter macros.
q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
FILTER_HEV_MASK;
FILTER4;
// Transpose 8x4 to 4x8
// qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
// ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07
// 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
// 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37
x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
// 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
// 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
// Store 4 filtered bytes (p1 p0 q0 q1) back into each of the 8 rows.
*(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
*(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
*(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
*(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
*(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
*(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
*(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
*(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
}
void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment