Commit 6033fb85 authored by Steinar Midtskogen

Add v64_abs_s8, v128_abs_s8 and v256_abs_s8

Change-Id: I529509e4e997ba123799a3a581d20624d75cf582
parent 569c7b91
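The new v64_abs_s8, v128_abs_s8 and v256_abs_s8 wrappers compute a lane-wise absolute value of signed bytes (8, 16 and 32 lanes respectively). A minimal scalar sketch of the per-lane semantics, mirroring the c_v64_abs_s8 reference added below (the helper name abs_s8_ref is illustrative, not part of the library): as with _mm_abs_epi8 and vabs_s8, INT8_MIN maps back to INT8_MIN because the negation wraps modulo 256.

#include <stdint.h>

/* Illustrative reference: per-lane absolute value of signed bytes.
 * nlanes is 8 for v64, 16 for v128, 32 for v256. */
static void abs_s8_ref(uint8_t *out, const uint8_t *in, int nlanes) {
  for (int c = 0; c < nlanes; c++)
    out[c] = (int8_t)in[c] > 0 ? in[c] : (uint8_t)-in[c]; /* -128 stays -128 */
}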
@@ -97,6 +97,7 @@ SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); }
SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); }
SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
......
@@ -208,6 +208,10 @@ SIMD_INLINE v128 v128_abs_s16(v128 x) {
return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
}
SIMD_INLINE v128 v128_abs_s8(v128 x) {
return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x)));
}
SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
return vreinterpretq_s64_s32(
vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b)));
......
@@ -244,6 +244,10 @@ SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
}
SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
}
SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
c_v64 lo_bits = c_v64_mullo_s16(a, b);
c_v64 hi_bits = c_v64_mulhi_s16(a, b);
......
@@ -120,6 +120,16 @@ SIMD_INLINE v128 v128_abs_s16(v128 a) {
#endif
}
SIMD_INLINE v128 v128_abs_s8(v128 a) {
#if defined(__SSSE3__)
return _mm_abs_epi8(a);
#else
v128 t = _mm_sub_epi8(_mm_setzero_si128(), a);
v128 mask = _mm_cmplt_epi8(t, a);
return _mm_or_si128(_mm_andnot_si128(mask, t), _mm_and_si128(mask, a));
#endif
}
SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
return _mm_unpacklo_epi8(b, a);
}
......
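Without SSSE3 there is no byte-wide abs instruction, so the fallback above computes t = -a, builds a lane mask where t < a (that is, where a is positive), and blends a and t through that mask; the v64 x86 version further down uses the identical sequence. A scalar sketch of the same branchless select, with illustrative names only:

#include <stdint.h>

/* One lane of the SSE2 fallback: mask-select between a and its negation. */
static uint8_t abs_s8_select(uint8_t a) {
  uint8_t t = (uint8_t)-a;                              /* negate, wraps mod 256 */
  uint8_t mask = ((int8_t)t < (int8_t)a) ? 0xff : 0x00; /* set where a > 0 */
  return (uint8_t)((~mask & t) | (mask & a));           /* a if positive, -a otherwise */
}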
@@ -99,6 +99,7 @@ SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); }
SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); }
SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
......
@@ -253,6 +253,10 @@ SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
}
SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) {
return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0]));
}
SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
c_v128 lo_bits = c_v128_mullo_s16(a, b);
c_v128 hi_bits = c_v128_mulhi_s16(a, b);
......
@@ -211,6 +211,10 @@ SIMD_INLINE v256 v256_abs_s16(v256 a) {
return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo));
}
SIMD_INLINE v256 v256_abs_s8(v256 a) {
return v256_from_v128(v128_abs_s8(a.hi), v128_abs_s8(a.lo));
}
SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
v128 lo_bits = v128_mullo_s16(a, b);
v128 hi_bits = v128_mulhi_s16(a, b);
......
@@ -118,6 +118,8 @@ SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
// AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit
// lanes of lower or upper halves of a 256bit vector because the
// unpack/pack intrinsics operate on the 256 bit input vector as 2
......
@@ -81,6 +81,7 @@ SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); }
SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); }
SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
......
@@ -242,6 +242,10 @@ SIMD_INLINE v64 v64_abs_s16(v64 x) {
return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
}
SIMD_INLINE v64 v64_abs_s8(v64 x) {
return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x)));
}
SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
return vreinterpret_s64_s16(
vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
......
@@ -264,6 +264,13 @@ SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
return t;
}
SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.u8[c] = (int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c];
return t;
}
SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
c_v64 t;
if (mode) {
......
@@ -138,6 +138,16 @@ SIMD_INLINE v64 v64_abs_s16(v64 a) {
#endif
}
SIMD_INLINE v64 v64_abs_s8(v64 a) {
#if defined(__SSSE3__)
return _mm_abs_epi8(a);
#else
v64 t = _mm_sub_epi8(_mm_setzero_si128(), a);
v64 mask = _mm_cmplt_epi8(t, a);
return _mm_or_si128(_mm_andnot_si128(mask, t), _mm_and_si128(mask, a));
#endif
}
SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
......
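The 64-bit x86 fallback above is the same mask-select sequence as the 128-bit one; on x86 a v64 is carried in an XMM register, so the _mm_ byte operations run on all sixteen lanes and only the low eight bytes of the result are meaningful.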
@@ -325,6 +325,7 @@ const mapping m[] = { MAP(v64_sad_u8),
MAP(imm_v64_align<5>),
MAP(imm_v64_align<6>),
MAP(imm_v64_align<7>),
MAP(v64_abs_s8),
MAP(v64_abs_s16),
MAP(v64_unpacklo_u8_s16),
MAP(v64_unpackhi_u8_s16),
@@ -509,6 +510,7 @@ const mapping m[] = { MAP(v64_sad_u8),
MAP(imm_v128_align<13>),
MAP(imm_v128_align<14>),
MAP(imm_v128_align<15>),
MAP(v128_abs_s8),
MAP(v128_abs_s16),
MAP(v128_padd_s16),
MAP(v128_unpacklo_u16_s32),
......
@@ -264,7 +264,8 @@ INSTANTIATE(
SIMD_TUPLE(imm_v64_align<5>, 0U, 0U), SIMD_TUPLE(imm_v64_align<6>, 0U, 0U),
SIMD_TUPLE(imm_v64_align<7>, 0U, 0U));
INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64), SIMD_TUPLE(v64_abs_s16, 0U, 0U),
INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64), SIMD_TUPLE(v64_abs_s8, 0U, 0U),
SIMD_TUPLE(v64_abs_s16, 0U, 0U),
SIMD_TUPLE(v64_unpacklo_u8_s16, 0U, 0U),
SIMD_TUPLE(v64_unpackhi_u8_s16, 0U, 0U),
SIMD_TUPLE(v64_unpacklo_u16_s32, 0U, 0U),
@@ -312,10 +313,10 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64), SIMD_TUPLE(v64_abs_s16, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<6>, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<8>, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<10>, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<12>, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<14>, 0U, 0U));
SIMD_TUPLE(imm_v64_shl_n_16<12>, 0U, 0U));
INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64_Part2),
SIMD_TUPLE(imm_v64_shl_n_16<14>, 0U, 0U),
SIMD_TUPLE(imm_v64_shr_n_u16<1>, 0U, 0U),
SIMD_TUPLE(imm_v64_shr_n_u16<2>, 0U, 0U),
SIMD_TUPLE(imm_v64_shr_n_u16<4>, 0U, 0U),
@@ -437,8 +438,8 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128_Part2),
SIMD_TUPLE(imm_v128_align<14>, 0U, 0U),
SIMD_TUPLE(imm_v128_align<15>, 0U, 0U));
INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128), SIMD_TUPLE(v128_abs_s16, 0U, 0U),
SIMD_TUPLE(v128_padd_s16, 0U, 0U),
INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128), SIMD_TUPLE(v128_abs_s8, 0U, 0U),
SIMD_TUPLE(v128_abs_s16, 0U, 0U), SIMD_TUPLE(v128_padd_s16, 0U, 0U),
SIMD_TUPLE(v128_unpacklo_u8_s16, 0U, 0U),
SIMD_TUPLE(v128_unpacklo_u16_s32, 0U, 0U),
SIMD_TUPLE(v128_unpacklo_s16_s32, 0U, 0U),
@@ -484,10 +485,10 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128), SIMD_TUPLE(v128_abs_s16, 0U, 0U),
SIMD_TUPLE(imm_v128_shl_n_8<7>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<1>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<2>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<3>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<4>, 0U, 0U));
SIMD_TUPLE(imm_v128_shr_n_u8<3>, 0U, 0U));
INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part2),
SIMD_TUPLE(imm_v128_shr_n_u8<4>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<5>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<6>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<7>, 0U, 0U),
@@ -533,10 +534,10 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part2),
SIMD_TUPLE(imm_v128_shr_n_u32<1>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<4>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<8>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<12>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<16>, 0U, 0U));
SIMD_TUPLE(imm_v128_shr_n_u32<12>, 0U, 0U));
INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part3),
SIMD_TUPLE(imm_v128_shr_n_u32<16>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<20>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<24>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<28>, 0U, 0U),
......