Commit 9b8444a1 authored by Steinar Midtskogen, committed by Jean-Marc Valin

Add v64_ssub_u16, v128_ssub_u16 and v256_ssub_u16

Change-Id: I60543913cbd8dc5cad524ab74697227f9e93836e
parent 131a0d55
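
Note: unsigned saturating subtraction clamps at zero instead of wrapping modulo 2^16. A minimal scalar sketch of the semantics the new intrinsics implement, mirroring the c_v64_ssub_u16 reference added below (ssub_u16_scalar is an illustrative name, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative scalar model of unsigned saturating subtract. */
    static uint16_t ssub_u16_scalar(uint16_t a, uint16_t b) {
      return a < b ? 0 : a - b;
    }

    int main(void) {
      printf("%d\n", ssub_u16_scalar(3, 5));     /* 0, not 65534 */
      printf("%d\n", ssub_u16_scalar(5, 3));     /* 2 */
      printf("%d\n", ssub_u16_scalar(0, 65535)); /* 0 */
      return 0;
    }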
@@ -94,6 +94,7 @@ SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
 SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); }
 SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); }
 SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
+SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
 SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
 SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
......
@@ -184,6 +184,11 @@ SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) {
       vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
 }

+SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) {
+  return vreinterpretq_s64_u16(
+      vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
+}
+
 SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) {
   return vreinterpretq_s64_u8(
       vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
......
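
Note: v64/v128 are carried as 64-bit-lane NEON vectors here, so the operands are relabeled to u16 lanes with vreinterpret(q), subtracted with the saturating vqsub(q)_u16, and relabeled back; the reinterpret casts generate no instructions. A hedged usage sketch, assuming an ARM target with <arm_neon.h> (the function name is illustrative, not part of the patch):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Each of the eight u16 lanes gets max(a - b, 0). */
    void ssub_u16_x8(const uint16_t *a, const uint16_t *b, uint16_t *out) {
      uint16x8_t va = vld1q_u16(a);
      uint16x8_t vb = vld1q_u16(b);
      vst1q_u16(out, vqsubq_u16(va, vb));
    }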
@@ -230,6 +230,11 @@ SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
                          c_v64_ssub_s16(a.v64[0], b.v64[0]));
 }

+SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
+                         c_v64_ssub_u16(a.v64[0], b.v64[0]));
+}
+
 SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
                          c_v64_sub_32(a.v64[0], b.v64[0]));
......
@@ -108,6 +108,8 @@ SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
 SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }
+SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }
 SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
 SIMD_INLINE v128 v128_abs_s16(v128 a) {
......
@@ -96,6 +96,7 @@ SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
 SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); }
 SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); }
 SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); }
 SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
 SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
......
@@ -239,6 +239,11 @@ SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
                           c_v128_ssub_s16(a.v128[0], b.v128[0]));
 }

+SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]),
+                          c_v128_ssub_u16(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
                           c_v128_sub_32(a.v128[0], b.v128[0]));
......
@@ -199,6 +199,10 @@ SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
   return v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo));
 }

+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
+  return v256_from_v128(v128_ssub_u16(a.hi, b.hi), v128_ssub_u16(a.lo, b.lo));
+}
+
 SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
   return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo));
 }
......
@@ -110,6 +110,10 @@ SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
   return _mm256_subs_epi16(a, b);
 }

+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
+  return _mm256_subs_epu16(a, b);
+}
+
 SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
 SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
......
@@ -78,6 +78,7 @@ SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); }
 SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); }
 SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); }
 SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); }
 SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
 SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
......
@@ -218,6 +218,11 @@ SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
       vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
 }

+SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) {
+  return vreinterpret_s64_u16(
+      vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
+}
+
 SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
   return vreinterpret_s64_u8(
       vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
......
@@ -240,6 +240,15 @@ SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
   return t;
 }

+SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) {
+  c_v64 t;
+  int c;
+  for (c = 0; c < 4; c++)
+    t.u16[c] =
+        (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c];
+  return t;
+}
+
 SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
   c_v64 t;
   t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]);
......
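
Note: the C reference widens both lanes to int32_t before testing for underflow; since every uint16_t value fits in int32_t, this is equivalent to the simpler a < b test. A small self-check sketch (illustrative, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* The widened form used in c_v64_ssub_u16 above. */
    static uint16_t ssub_u16_ref(uint16_t a, uint16_t b) {
      return (int32_t)a - (int32_t)b < 0 ? 0 : a - b;
    }

    int main(void) {
      uint32_t a, b;
      for (a = 0; a < 65536; a += 251)
        for (b = 0; b < 65536; b += 257)
          assert(ssub_u16_ref((uint16_t)a, (uint16_t)b) ==
                 (uint16_t)(a < b ? 0 : a - b));
      return 0;
    }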
@@ -126,6 +126,8 @@ SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }
 SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); }
 SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }
 SIMD_INLINE v64 v64_abs_s16(v64 a) {
......
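
Note: on x86 the operation maps directly to the SSE2 instruction PSUBUSW via _mm_subs_epu16, so the v64 and v128 wrappers cost a single instruction each. A hedged sketch, assuming an SSE2 target with <emmintrin.h> (the function name is illustrative, not part of the patch):

    #include <emmintrin.h>
    #include <stdint.h>

    /* _mm_subs_epu16 (PSUBUSW) saturates each u16 lane at 0. */
    void ssub_u16_x8_sse2(const uint16_t *a, const uint16_t *b,
                          uint16_t *out) {
      __m128i va = _mm_loadu_si128((const __m128i *)a);
      __m128i vb = _mm_loadu_si128((const __m128i *)b);
      _mm_storeu_si128((__m128i *)out, _mm_subs_epu16(va, vb));
    }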
@@ -278,6 +278,7 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v64_ssub_s8),
                       MAP(v64_sub_16),
                       MAP(v64_ssub_s16),
+                      MAP(v64_ssub_u16),
                       MAP(v64_sub_32),
                       MAP(v64_ziplo_8),
                       MAP(v64_ziphi_8),
@@ -449,6 +450,7 @@ const mapping m[] = { MAP(v64_sad_u8),
                       MAP(v128_ssub_s8),
                       MAP(v128_sub_16),
                       MAP(v128_ssub_s16),
+                      MAP(v128_ssub_u16),
                       MAP(v128_sub_32),
                       MAP(v128_ziplo_8),
                       MAP(v128_ziphi_8),
......
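
Note: these MAP entries register the new intrinsics with the test harness, which, judging from the c_ prefix convention used throughout this patch, compares each optimized implementation against its C reference. A hypothetical sketch of the shape such a table entry might take (the real MAP macro is defined elsewhere in the test code and is not shown in this diff):

    typedef void (*fptr)(void);

    /* Hypothetical MAP-like macro, illustrative only: pairs an intrinsic
     * with its c_ reference under one name so outputs can be compared. */
    #define MAP_SKETCH(name) \
      { #name, (fptr)name, (fptr)c_##name }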
@@ -236,32 +236,33 @@ INSTANTIATE(
     SIMD_TUPLE(v64_add_32, 0U, 0U), SIMD_TUPLE(v64_sub_8, 0U, 0U),
     SIMD_TUPLE(v64_ssub_u8, 0U, 0U), SIMD_TUPLE(v64_ssub_s8, 0U, 0U),
     SIMD_TUPLE(v64_sub_16, 0U, 0U), SIMD_TUPLE(v64_ssub_s16, 0U, 0U),
-    SIMD_TUPLE(v64_sub_32, 0U, 0U), SIMD_TUPLE(v64_ziplo_8, 0U, 0U),
-    SIMD_TUPLE(v64_ziphi_8, 0U, 0U), SIMD_TUPLE(v64_ziplo_16, 0U, 0U),
-    SIMD_TUPLE(v64_ziphi_16, 0U, 0U), SIMD_TUPLE(v64_ziplo_32, 0U, 0U),
-    SIMD_TUPLE(v64_ziphi_32, 0U, 0U), SIMD_TUPLE(v64_pack_s32_s16, 0U, 0U),
-    SIMD_TUPLE(v64_pack_s16_u8, 0U, 0U), SIMD_TUPLE(v64_pack_s16_s8, 0U, 0U),
-    SIMD_TUPLE(v64_unziphi_8, 0U, 0U), SIMD_TUPLE(v64_unziplo_8, 0U, 0U),
-    SIMD_TUPLE(v64_unziphi_16, 0U, 0U), SIMD_TUPLE(v64_unziplo_16, 0U, 0U),
-    SIMD_TUPLE(v64_or, 0U, 0U), SIMD_TUPLE(v64_xor, 0U, 0U),
-    SIMD_TUPLE(v64_and, 0U, 0U), SIMD_TUPLE(v64_andn, 0U, 0U),
-    SIMD_TUPLE(v64_mullo_s16, 0U, 0U), SIMD_TUPLE(v64_mulhi_s16, 0U, 0U),
-    SIMD_TUPLE(v64_mullo_s32, 0U, 0U), SIMD_TUPLE(v64_madd_s16, 0U, 0U),
-    SIMD_TUPLE(v64_madd_us8, 0U, 0U), SIMD_TUPLE(v64_avg_u8, 0U, 0U),
-    SIMD_TUPLE(v64_rdavg_u8, 0U, 0U), SIMD_TUPLE(v64_avg_u16, 0U, 0U),
-    SIMD_TUPLE(v64_min_u8, 0U, 0U), SIMD_TUPLE(v64_max_u8, 0U, 0U),
-    SIMD_TUPLE(v64_min_s8, 0U, 0U), SIMD_TUPLE(v64_max_s8, 0U, 0U),
-    SIMD_TUPLE(v64_min_s16, 0U, 0U), SIMD_TUPLE(v64_max_s16, 0U, 0U),
-    SIMD_TUPLE(v64_cmpgt_s8, 0U, 0U), SIMD_TUPLE(v64_cmplt_s8, 0U, 0U),
-    SIMD_TUPLE(v64_cmpeq_8, 0U, 0U), SIMD_TUPLE(v64_cmpgt_s16, 0U, 0U),
-    SIMD_TUPLE(v64_cmplt_s16, 0U, 0U), SIMD_TUPLE(v64_cmpeq_16, 0U, 0U),
-    SIMD_TUPLE(v64_shuffle_8, 7U, 8U));
+    SIMD_TUPLE(v64_ssub_u16, 0U, 0U), SIMD_TUPLE(v64_sub_32, 0U, 0U),
+    SIMD_TUPLE(v64_ziplo_8, 0U, 0U), SIMD_TUPLE(v64_ziphi_8, 0U, 0U),
+    SIMD_TUPLE(v64_ziplo_16, 0U, 0U), SIMD_TUPLE(v64_ziphi_16, 0U, 0U),
+    SIMD_TUPLE(v64_ziplo_32, 0U, 0U), SIMD_TUPLE(v64_ziphi_32, 0U, 0U),
+    SIMD_TUPLE(v64_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v64_pack_s16_u8, 0U, 0U),
+    SIMD_TUPLE(v64_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v64_unziphi_8, 0U, 0U),
+    SIMD_TUPLE(v64_unziplo_8, 0U, 0U), SIMD_TUPLE(v64_unziphi_16, 0U, 0U),
+    SIMD_TUPLE(v64_unziplo_16, 0U, 0U), SIMD_TUPLE(v64_or, 0U, 0U),
+    SIMD_TUPLE(v64_xor, 0U, 0U), SIMD_TUPLE(v64_and, 0U, 0U),
+    SIMD_TUPLE(v64_andn, 0U, 0U), SIMD_TUPLE(v64_mullo_s16, 0U, 0U),
+    SIMD_TUPLE(v64_mulhi_s16, 0U, 0U), SIMD_TUPLE(v64_mullo_s32, 0U, 0U),
+    SIMD_TUPLE(v64_madd_s16, 0U, 0U), SIMD_TUPLE(v64_madd_us8, 0U, 0U),
+    SIMD_TUPLE(v64_avg_u8, 0U, 0U), SIMD_TUPLE(v64_rdavg_u8, 0U, 0U),
+    SIMD_TUPLE(v64_avg_u16, 0U, 0U), SIMD_TUPLE(v64_min_u8, 0U, 0U),
+    SIMD_TUPLE(v64_max_u8, 0U, 0U), SIMD_TUPLE(v64_min_s8, 0U, 0U),
+    SIMD_TUPLE(v64_max_s8, 0U, 0U), SIMD_TUPLE(v64_min_s16, 0U, 0U),
+    SIMD_TUPLE(v64_max_s16, 0U, 0U), SIMD_TUPLE(v64_cmpgt_s8, 0U, 0U),
+    SIMD_TUPLE(v64_cmplt_s8, 0U, 0U), SIMD_TUPLE(v64_cmpeq_8, 0U, 0U),
+    SIMD_TUPLE(v64_cmpgt_s16, 0U, 0U), SIMD_TUPLE(v64_cmplt_s16, 0U, 0U),
+    SIMD_TUPLE(v64_cmpeq_16, 0U, 0U));

 INSTANTIATE(
-    ARCH, ARCH_POSTFIX(V64_V64V64_Part2), SIMD_TUPLE(imm_v64_align<1>, 0U, 0U),
-    SIMD_TUPLE(imm_v64_align<2>, 0U, 0U), SIMD_TUPLE(imm_v64_align<3>, 0U, 0U),
-    SIMD_TUPLE(imm_v64_align<4>, 0U, 0U), SIMD_TUPLE(imm_v64_align<5>, 0U, 0U),
-    SIMD_TUPLE(imm_v64_align<6>, 0U, 0U), SIMD_TUPLE(imm_v64_align<7>, 0U, 0U));
+    ARCH, ARCH_POSTFIX(V64_V64V64_Part2), SIMD_TUPLE(v64_shuffle_8, 7U, 8U),
+    SIMD_TUPLE(imm_v64_align<1>, 0U, 0U), SIMD_TUPLE(imm_v64_align<2>, 0U, 0U),
+    SIMD_TUPLE(imm_v64_align<3>, 0U, 0U), SIMD_TUPLE(imm_v64_align<4>, 0U, 0U),
+    SIMD_TUPLE(imm_v64_align<5>, 0U, 0U), SIMD_TUPLE(imm_v64_align<6>, 0U, 0U),
+    SIMD_TUPLE(imm_v64_align<7>, 0U, 0U));

 INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64), SIMD_TUPLE(v64_abs_s16, 0U, 0U),
             SIMD_TUPLE(v64_unpacklo_u8_s16, 0U, 0U),
@@ -394,29 +395,30 @@ INSTANTIATE(
     SIMD_TUPLE(v128_add_32, 0U, 0U), SIMD_TUPLE(v128_sub_8, 0U, 0U),
     SIMD_TUPLE(v128_ssub_u8, 0U, 0U), SIMD_TUPLE(v128_ssub_s8, 0U, 0U),
     SIMD_TUPLE(v128_sub_16, 0U, 0U), SIMD_TUPLE(v128_ssub_s16, 0U, 0U),
-    SIMD_TUPLE(v128_sub_32, 0U, 0U), SIMD_TUPLE(v128_ziplo_8, 0U, 0U),
-    SIMD_TUPLE(v128_ziphi_8, 0U, 0U), SIMD_TUPLE(v128_ziplo_16, 0U, 0U),
-    SIMD_TUPLE(v128_ziphi_16, 0U, 0U), SIMD_TUPLE(v128_ziplo_32, 0U, 0U),
-    SIMD_TUPLE(v128_ziphi_32, 0U, 0U), SIMD_TUPLE(v128_ziplo_64, 0U, 0U),
-    SIMD_TUPLE(v128_ziphi_64, 0U, 0U), SIMD_TUPLE(v128_unziphi_8, 0U, 0U),
-    SIMD_TUPLE(v128_unziplo_8, 0U, 0U), SIMD_TUPLE(v128_unziphi_16, 0U, 0U),
-    SIMD_TUPLE(v128_unziplo_16, 0U, 0U), SIMD_TUPLE(v128_unziphi_32, 0U, 0U),
-    SIMD_TUPLE(v128_unziplo_32, 0U, 0U), SIMD_TUPLE(v128_pack_s32_s16, 0U, 0U),
-    SIMD_TUPLE(v128_pack_s16_u8, 0U, 0U), SIMD_TUPLE(v128_pack_s16_s8, 0U, 0U),
-    SIMD_TUPLE(v128_or, 0U, 0U), SIMD_TUPLE(v128_xor, 0U, 0U),
-    SIMD_TUPLE(v128_and, 0U, 0U), SIMD_TUPLE(v128_andn, 0U, 0U),
-    SIMD_TUPLE(v128_mullo_s16, 0U, 0U), SIMD_TUPLE(v128_mulhi_s16, 0U, 0U),
-    SIMD_TUPLE(v128_mullo_s32, 0U, 0U), SIMD_TUPLE(v128_madd_s16, 0U, 0U),
-    SIMD_TUPLE(v128_madd_us8, 0U, 0U), SIMD_TUPLE(v128_avg_u8, 0U, 0U),
-    SIMD_TUPLE(v128_rdavg_u8, 0U, 0U), SIMD_TUPLE(v128_avg_u16, 0U, 0U),
-    SIMD_TUPLE(v128_min_u8, 0U, 0U), SIMD_TUPLE(v128_max_u8, 0U, 0U),
-    SIMD_TUPLE(v128_min_s8, 0U, 0U), SIMD_TUPLE(v128_max_s8, 0U, 0U),
-    SIMD_TUPLE(v128_min_s16, 0U, 0U), SIMD_TUPLE(v128_max_s16, 0U, 0U),
-    SIMD_TUPLE(v128_cmpgt_s8, 0U, 0U), SIMD_TUPLE(v128_cmplt_s8, 0U, 0U),
-    SIMD_TUPLE(v128_cmpeq_8, 0U, 0U), SIMD_TUPLE(v128_cmpgt_s16, 0U, 0U),
-    SIMD_TUPLE(v128_cmpeq_16, 0U, 0U));
+    SIMD_TUPLE(v128_ssub_u16, 0U, 0U), SIMD_TUPLE(v128_sub_32, 0U, 0U),
+    SIMD_TUPLE(v128_ziplo_8, 0U, 0U), SIMD_TUPLE(v128_ziphi_8, 0U, 0U),
+    SIMD_TUPLE(v128_ziplo_16, 0U, 0U), SIMD_TUPLE(v128_ziphi_16, 0U, 0U),
+    SIMD_TUPLE(v128_ziplo_32, 0U, 0U), SIMD_TUPLE(v128_ziphi_32, 0U, 0U),
+    SIMD_TUPLE(v128_ziplo_64, 0U, 0U), SIMD_TUPLE(v128_ziphi_64, 0U, 0U),
+    SIMD_TUPLE(v128_unziphi_8, 0U, 0U), SIMD_TUPLE(v128_unziplo_8, 0U, 0U),
+    SIMD_TUPLE(v128_unziphi_16, 0U, 0U), SIMD_TUPLE(v128_unziplo_16, 0U, 0U),
+    SIMD_TUPLE(v128_unziphi_32, 0U, 0U), SIMD_TUPLE(v128_unziplo_32, 0U, 0U),
+    SIMD_TUPLE(v128_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v128_pack_s16_u8, 0U, 0U),
+    SIMD_TUPLE(v128_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v128_or, 0U, 0U),
+    SIMD_TUPLE(v128_xor, 0U, 0U), SIMD_TUPLE(v128_and, 0U, 0U),
+    SIMD_TUPLE(v128_andn, 0U, 0U), SIMD_TUPLE(v128_mullo_s16, 0U, 0U),
+    SIMD_TUPLE(v128_mulhi_s16, 0U, 0U), SIMD_TUPLE(v128_mullo_s32, 0U, 0U),
+    SIMD_TUPLE(v128_madd_s16, 0U, 0U), SIMD_TUPLE(v128_madd_us8, 0U, 0U),
+    SIMD_TUPLE(v128_avg_u8, 0U, 0U), SIMD_TUPLE(v128_rdavg_u8, 0U, 0U),
+    SIMD_TUPLE(v128_avg_u16, 0U, 0U), SIMD_TUPLE(v128_min_u8, 0U, 0U),
+    SIMD_TUPLE(v128_max_u8, 0U, 0U), SIMD_TUPLE(v128_min_s8, 0U, 0U),
+    SIMD_TUPLE(v128_max_s8, 0U, 0U), SIMD_TUPLE(v128_min_s16, 0U, 0U),
+    SIMD_TUPLE(v128_max_s16, 0U, 0U), SIMD_TUPLE(v128_cmpgt_s8, 0U, 0U),
+    SIMD_TUPLE(v128_cmplt_s8, 0U, 0U), SIMD_TUPLE(v128_cmpeq_8, 0U, 0U),
+    SIMD_TUPLE(v128_cmpgt_s16, 0U, 0U));

 INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128_Part2),
+            SIMD_TUPLE(v128_cmpeq_16, 0U, 0U),
             SIMD_TUPLE(v128_cmplt_s16, 0U, 0U),
             SIMD_TUPLE(v128_shuffle_8, 15U, 8U),
             SIMD_TUPLE(imm_v128_align<1>, 0U, 0U),
......