Commit 1b2b739b authored by Steinar Midtskogen's avatar Steinar Midtskogen

Add s8 -> s16 unpack intrinsics

Change-Id: Iec22c6442c55a5908d858766ff6dfb8bff69835d
parent 87c24a1d
......@@ -158,6 +158,13 @@ SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { return c_v128_unpackhi_u8_s16(a); }
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
  return c_v128_unpack_s8_s16(a);
}
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { return c_v128_unpacklo_s8_s16(a); }
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { return c_v128_unpackhi_s8_s16(a); }
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { return c_v128_pack_s32_s16(a, b); }
......
......@@ -388,6 +388,18 @@ SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a))));
}
/* Sign-extend all eight s8 lanes of the 64-bit input to eight s16 lanes;
   vmovl_s8 performs the widening sign extension. */
SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a)));
}
/* Take the low 64 bits of 'a', then sign-extend its s8 lanes to s16. */
SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))));
}
/* Take the high 64 bits of 'a', then sign-extend its s8 lanes to s16. */
SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))));
}
SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
return v128_from_v64(
vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))),
......
......@@ -499,6 +499,20 @@ SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
c_v64_unpacklo_u8_s16(a.v64[1]));
}
/* Widen all eight s8 lanes of 'a' to s16, combining the widened high and
   low halves into one c_v128. */
SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) {
  const c_v64 hi = c_v64_unpackhi_s8_s16(a);
  const c_v64 lo = c_v64_unpacklo_s8_s16(a);
  return c_v128_from_v64(hi, lo);
}
/* Widen the s8 lanes of the low v64 of 'a' to s16. */
SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) {
  const c_v64 half = a.v64[0];
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(half), c_v64_unpacklo_s8_s16(half));
}
/* Widen the s8 lanes of the high v64 of 'a' to s16. */
SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) {
  const c_v64 half = a.v64[1];
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(half), c_v64_unpacklo_s8_s16(half));
}
SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
......
......@@ -225,6 +225,18 @@ SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
return _mm_unpackhi_epi8(a, _mm_setzero_si128());
}
SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
/* Interleaving 'a' with itself doubles each byte into a 16-bit lane;
   the arithmetic right shift by 8 then leaves the sign-extended value. */
return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}
SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
/* Duplicate the low eight bytes into 16-bit lanes, then arithmetic-shift
   to sign-extend each lane. */
return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}
SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
/* Duplicate the high eight bytes into 16-bit lanes, then arithmetic-shift
   to sign-extend each lane. */
return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
}
SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
/* Saturating s32 -> s16 pack; 'b' supplies the low half of the result
   and 'a' the high half (note the swapped intrinsic operand order). */
return _mm_packs_epi32(b, a);
}
......
......@@ -166,6 +166,13 @@ SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { return c_v256_unpackhi_u8_s16(a); }
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
  return c_v256_unpack_s8_s16(a);
}
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { return c_v256_unpacklo_s8_s16(a); }
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { return c_v256_unpackhi_s8_s16(a); }
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { return c_v256_pack_s32_s16(a, b); }
......
......@@ -496,6 +496,20 @@ SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
c_v128_unpacklo_u8_s16(a.v128[1]));
}
/* Widen all sixteen s8 lanes of 'a' to s16, combining the widened high
   and low halves into one c_v256. */
SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
  const c_v128 hi = c_v128_unpackhi_s8_s16(a);
  const c_v128 lo = c_v128_unpacklo_s8_s16(a);
  return c_v256_from_v128(hi, lo);
}
/* Widen the s8 lanes of the low v128 of 'a' to s16. */
SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
  const c_v128 half = a.v128[0];
  return c_v256_from_v128(c_v128_unpackhi_s8_s16(half),
                          c_v128_unpacklo_s8_s16(half));
}
/* Widen the s8 lanes of the high v128 of 'a' to s16. */
SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
  const c_v128 half = a.v128[1];
  return c_v256_from_v128(c_v128_unpackhi_s8_s16(half),
                          c_v128_unpacklo_s8_s16(half));
}
SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
......
......@@ -370,6 +370,18 @@ SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi));
}
/* Widen all sixteen s8 lanes of 'a' to s16 via the v128 unpack helpers. */
SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
  const v128 hi = v128_unpackhi_s8_s16(a);
  const v128 lo = v128_unpacklo_s8_s16(a);
  return v256_from_v128(hi, lo);
}
/* Widen the s8 lanes of the low v128 of 'a' to s16. */
SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
  const v128 half = a.lo;
  return v256_from_v128(v128_unpackhi_s8_s16(half), v128_unpacklo_s8_s16(half));
}
/* Widen the s8 lanes of the high v128 of 'a' to s16. */
SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
  const v128 half = a.hi;
  return v256_from_v128(v128_unpackhi_s8_s16(half), v128_unpacklo_s8_s16(half));
}
SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo),
v128_pack_s32_s16(b.hi, b.lo));
......
......@@ -228,6 +228,20 @@ SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
v128_unpacklo_u8_s16(v256_high_v128(a)));
}
/* Widen all sixteen s8 lanes of 'a' to s16 via the v128 unpack helpers. */
SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
  const v128 hi = v128_unpackhi_s8_s16(a);
  const v128 lo = v128_unpacklo_s8_s16(a);
  return v256_from_v128(hi, lo);
}
/* Widen the s8 lanes of the low v128 of 'a' to s16. */
SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
  const v128 half = v256_low_v128(a);
  return v256_from_v128(v128_unpackhi_s8_s16(half), v128_unpacklo_s8_s16(half));
}
/* Widen the s8 lanes of the high v128 of 'a' to s16. */
SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
  const v128 half = v256_high_v128(a);
  return v256_from_v128(v128_unpackhi_s8_s16(half), v128_unpacklo_s8_s16(half));
}
SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)),
v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b)));
......
......@@ -95,6 +95,8 @@ SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); }
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
  return c_v64_unziphi_16(a, b);
}
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  return c_v64_unpacklo_u8_s16(a);
}
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  return c_v64_unpackhi_u8_s16(a);
}
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
  return c_v64_unpacklo_s8_s16(a);
}
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
  return c_v64_unpackhi_s8_s16(a);
}
/* Forward to the plain-C reference implementation. */
SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { return c_v64_pack_s32_s16(a, b); }
......
......@@ -358,6 +358,14 @@ SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a))));
}
SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
/* vmovl_s8 sign-extends all eight bytes; keep the low four s16 lanes. */
return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a))));
}
SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
/* vmovl_s8 sign-extends all eight bytes; keep the high four s16 lanes. */
return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a))));
}
SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
return vreinterpret_s64_s16(vqmovn_s32(
vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
......
......@@ -425,6 +425,26 @@ SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
return t;
}
/* Sign-extend the four "low" s8 lanes of 'a' to s16.  On big-endian
   targets the low lanes live at byte offset 4, hence the endian offset. */
SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) {
  c_v64 t;
  int i;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  for (i = 0; i < 4; i++) t.s16[i] = (int16_t)a.s8[i + endian];
  return t;
}
/* Sign-extend the four "high" s8 lanes of 'a' to s16.  On big-endian
   targets the high lanes live at byte offset 0, hence the endian offset. */
SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) {
  c_v64 t;
  int i;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  for (i = 0; i < 4; i++) t.s16[i] = (int16_t)a.s8[4 + i - endian];
  return t;
}
SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
c_v64 t;
if (CONFIG_BIG_ENDIAN) {
......
......@@ -228,6 +228,14 @@ SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
}
SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
/* Duplicate each byte into a 16-bit lane, then arithmetic-shift right by 8
   to sign-extend; the low four results sit in the low 64 bits. */
return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}
SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
/* Sign-extend all eight bytes as in unpacklo, then shift the register right
   by 8 bytes so the upper four results land in the low 64 bits. */
return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8);
}
SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
/* Interleave with zero: zero-extends the low u16 lanes to 32 bits. */
return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}
......
......@@ -329,6 +329,8 @@ const mapping m[] = { MAP(v64_sad_u8),
MAP(v64_abs_s16),
MAP(v64_unpacklo_u8_s16),
MAP(v64_unpackhi_u8_s16),
MAP(v64_unpacklo_s8_s16),
MAP(v64_unpackhi_s8_s16),
MAP(v64_unpacklo_u16_s32),
MAP(v64_unpacklo_s16_s32),
MAP(v64_unpackhi_u16_s32),
......@@ -622,6 +624,7 @@ const mapping m[] = { MAP(v64_sad_u8),
MAP(v128_zip_32),
MAP(v128_mul_s16),
MAP(v128_unpack_u8_s16),
MAP(v128_unpack_s8_s16),
MAP(v128_unpack_u16_s32),
MAP(v128_unpack_s16_s32),
MAP(v128_shl_8),
......@@ -646,6 +649,8 @@ const mapping m[] = { MAP(v64_sad_u8),
MAP(v128_dup_32),
MAP(v128_unpacklo_u8_s16),
MAP(v128_unpackhi_u8_s16),
MAP(v128_unpacklo_s8_s16),
MAP(v128_unpackhi_s8_s16),
MAP(u32_load_unaligned),
MAP(u32_store_unaligned),
MAP(v64_load_unaligned),
......
......@@ -268,6 +268,8 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64), SIMD_TUPLE(v64_abs_s8, 0U, 0U),
SIMD_TUPLE(v64_abs_s16, 0U, 0U),
SIMD_TUPLE(v64_unpacklo_u8_s16, 0U, 0U),
SIMD_TUPLE(v64_unpackhi_u8_s16, 0U, 0U),
SIMD_TUPLE(v64_unpacklo_s8_s16, 0U, 0U),
SIMD_TUPLE(v64_unpackhi_s8_s16, 0U, 0U),
SIMD_TUPLE(v64_unpacklo_u16_s32, 0U, 0U),
SIMD_TUPLE(v64_unpacklo_s16_s32, 0U, 0U),
SIMD_TUPLE(v64_unpackhi_u16_s32, 0U, 0U),
......@@ -311,11 +313,11 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64), SIMD_TUPLE(v64_abs_s8, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<2>, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<4>, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<6>, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<8>, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<10>, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<12>, 0U, 0U));
SIMD_TUPLE(imm_v64_shl_n_16<8>, 0U, 0U));
INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64_Part2),
SIMD_TUPLE(imm_v64_shl_n_16<10>, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<12>, 0U, 0U),
SIMD_TUPLE(imm_v64_shl_n_16<14>, 0U, 0U),
SIMD_TUPLE(imm_v64_shr_n_u16<1>, 0U, 0U),
SIMD_TUPLE(imm_v64_shr_n_u16<2>, 0U, 0U),
......@@ -441,9 +443,11 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128_Part2),
INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128), SIMD_TUPLE(v128_abs_s8, 0U, 0U),
SIMD_TUPLE(v128_abs_s16, 0U, 0U), SIMD_TUPLE(v128_padd_s16, 0U, 0U),
SIMD_TUPLE(v128_unpacklo_u8_s16, 0U, 0U),
SIMD_TUPLE(v128_unpacklo_s8_s16, 0U, 0U),
SIMD_TUPLE(v128_unpacklo_u16_s32, 0U, 0U),
SIMD_TUPLE(v128_unpacklo_s16_s32, 0U, 0U),
SIMD_TUPLE(v128_unpackhi_u8_s16, 0U, 0U),
SIMD_TUPLE(v128_unpackhi_s8_s16, 0U, 0U),
SIMD_TUPLE(v128_unpackhi_u16_s32, 0U, 0U),
SIMD_TUPLE(v128_unpackhi_s16_s32, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_byte<1>, 0U, 0U),
......@@ -483,11 +487,11 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128), SIMD_TUPLE(v128_abs_s8, 0U, 0U),
SIMD_TUPLE(imm_v128_shl_n_8<5>, 0U, 0U),
SIMD_TUPLE(imm_v128_shl_n_8<6>, 0U, 0U),
SIMD_TUPLE(imm_v128_shl_n_8<7>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<1>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<2>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<3>, 0U, 0U));
SIMD_TUPLE(imm_v128_shr_n_u8<1>, 0U, 0U));
INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part2),
SIMD_TUPLE(imm_v128_shr_n_u8<2>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<3>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<4>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<5>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u8<6>, 0U, 0U),
......@@ -532,11 +536,11 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part2),
SIMD_TUPLE(imm_v128_shl_n_32<24>, 0U, 0U),
SIMD_TUPLE(imm_v128_shl_n_32<28>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<1>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<4>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<8>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<12>, 0U, 0U));
SIMD_TUPLE(imm_v128_shr_n_u32<4>, 0U, 0U));
INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part3),
SIMD_TUPLE(imm_v128_shr_n_u32<8>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<12>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<16>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<20>, 0U, 0U),
SIMD_TUPLE(imm_v128_shr_n_u32<24>, 0U, 0U),
......@@ -558,6 +562,7 @@ INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U64U64), SIMD_TUPLE(v128_from_64, 0U, 0U));
INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64),
SIMD_TUPLE(v128_unpack_u8_s16, 0U, 0U),
SIMD_TUPLE(v128_unpack_s8_s16, 0U, 0U),
SIMD_TUPLE(v128_unpack_u16_s32, 0U, 0U),
SIMD_TUPLE(v128_unpack_s16_s32, 0U, 0U));
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment