Commit 009946c8, authored by Timothy B. Terriberry, committed by Tim Terriberry

daala_tx: Undo manual SIMD multiply expansion

On x86 there is no PMULHRSD for use in the 32-bit transform
versions, so the fastest approach is to just do a normal 32-bit
multiply and manually shift and round. This requires keeping the
constants in their reduced precision instead of always promoting
them to Q15.

Change-Id: I76339b5567da3f08f34882a707e0c93122991946
parent 170c946e
......@@ -55,6 +55,20 @@ static INLINE __m128i od_mulhrs_epi16(__m128i a, int16_t b) {
/* Broadcasts the scalar b to every 16-bit lane and multiplies it with a
   using _mm_mulhrs_epi16 (PMULHRSW), i.e. (a * b + 16384) >> 15 per lane.
   b is therefore interpreted as a Q15 fixed-point constant. */
return _mm_mulhrs_epi16(a, _mm_set1_epi16(b));
}
/* Multiplies every signed 16-bit lane of a by the fixed-point constant
   b / 2**r with rounding, i.e. (a * b + ((1 << r) >> 1)) >> r per lane.

   a: vector of eight signed 16-bit values.
   b: numerator of the constant, expressed with r fractional bits.
   r: number of fractional bits in b; must satisfy 0 <= r <= 15.

   The constant is rescaled to Q15 so that PMULHRSW can do the multiply.
   A Q15 value outside the int16_t range cannot be fed to PMULHRSW directly,
   so it is split into an integer part of +/-1 (applied with a plain
   add/subtract of a) and a residual that does fit.  This only works while
   the Q15 constant has magnitude below 65536; larger constants would be
   truncated by the int16_t casts.  The add/subtract are the ordinary
   wrapping 16-bit lane operations. */
static INLINE __m128i od_mul_epi16(__m128i a, int32_t b, int r) {
  /* Rescale by multiplying instead of `b << (15 - r)`: left-shifting a
     negative value is undefined behavior in C, and some callers pass a
     negative b (e.g. -46341 with r = 15). */
  const int32_t b_q15 = b * (1 << (15 - r));
  /* b and r are in all cases compile-time constants, so these branches
     disappear when this function gets inlined. */
  if (b_q15 > 32767) {
    /* a * c == a + a * (c - 1.0) */
    return _mm_add_epi16(a, od_mulhrs_epi16(a, (int16_t)(b_q15 - 32768)));
  }
  if (b_q15 < -32767) {
    /* a * c == a * (c + 1.0) - a */
    return _mm_sub_epi16(od_mulhrs_epi16(a, (int16_t)(32768 + b_q15)), a);
  }
  /* The constant already fits in int16_t; make the narrowing explicit. */
  return od_mulhrs_epi16(a, (int16_t)b_q15);
}
/* Returns a vector whose eight 16-bit lanes all hold (1 << bd) - 1, the
   largest unsigned value that fits in bd bits.
   NOTE(review): presumably bd is the pixel bit depth -- confirm with callers. */
static INLINE __m128i od_hbd_max_epi16(int bd) {
  const int max_value = (1 << bd) - 1;
  return _mm_set1_epi16(max_value);
}
......@@ -451,7 +465,7 @@ static INLINE void od_transpose8x8(__m128i *r0, __m128i *r1, __m128i *r2,
#undef OD_RSHIFT1
#undef OD_AVG
#undef OD_HRSUB
#undef OD_MULHRS
#undef OD_MUL
#undef OD_SWAP
/* Define 8-wide 16-bit SSSE3 kernels. */
......@@ -464,7 +478,7 @@ static INLINE void od_transpose8x8(__m128i *r0, __m128i *r1, __m128i *r2,
#define OD_RSHIFT1 od_unbiased_rshift1_epi16
#define OD_AVG od_avg_epi16
#define OD_HRSUB od_hrsub_epi16
#define OD_MULHRS od_mulhrs_epi16
#define OD_MUL od_mul_epi16
#define OD_SWAP od_swap_epi16
#include "av1/common/x86/daala_tx_kernels.h"
......
......@@ -23,8 +23,8 @@
I.e., (a + b + 1) >> 1, without overflow
OD_HRSUB The function that implements a VHRSUB.S<16|32>
I.e., (a - b + 1) >> 1, without overflow
OD_MULHRS The function that implements a PMULHRS[WD]
I.e., (a * b + 16384) >> 15, without overflow
OD_MUL The function that implements the multiplies
I.e., (a * b + ((1 << r) >> 1)) >> r, without overflow
OD_SWAP The function that swaps two SIMD registers
See daala_inv_txfm_avx2.c for examples. */
......@@ -38,9 +38,9 @@ static INLINE void OD_KERNEL_FUNC(od_idct2)(OD_REG *p0, OD_REG *p1) {
/* Two-point kernel operating in place on the registers *p0 and *p1.
   NOTE(review): the diff markers were lost in this listing, so each changed
   statement appears twice in a row.  Going by the commit message, the
   OD_MULHRS form is presumably the removed line and the OD_MUL form its
   replacement; only one statement of each pair belongs in the real function. */
OD_REG t_;
t_ = OD_ADD(*p0, *p1);
/* 11585/8192 ~= 2*Sin[Pi/4] = 1.4142135623730951 */
/* Manually expanded form: *p0 + *p0*(11585 - 8192)/8192, with the residual
   constant shifted left by 2 to bring it from Q13 to the Q15 that
   OD_MULHRS expects. */
*p1 = OD_ADD(*p0, OD_MULHRS(*p0, (11585 - 8192) << 2));
/* Same multiply with the constant kept at its native precision (r = 13). */
*p1 = OD_MUL(*p0, 11585, 13);
/* 11585/16384 ~= Cos[Pi/4] = 0.7071067811865475 */
/* Q14 constant promoted to Q15 by the << 1. */
*p0 = OD_MULHRS(t_, 11585 << 1);
/* Same multiply with the constant kept in Q14 (r = 14). */
*p0 = OD_MUL(t_, 11585, 14);
*p1 = OD_SUB(*p1, *p0);
}
......@@ -48,11 +48,11 @@ static INLINE void OD_KERNEL_FUNC(od_idst2)(OD_REG *p0, OD_REG *p1) {
/* Two-point kernel operating in place on the registers *p0 and *p1.
   NOTE(review): the diff markers were lost in this listing, so each changed
   statement appears twice in a row.  Going by the commit message, the
   OD_MULHRS form is presumably the removed line and the OD_MUL form its
   replacement; only one statement of each pair belongs in the real function
   (applying both t_ updates below would scale t_ twice). */
OD_REG t_;
t_ = OD_AVG(*p0, *p1);
/* 8867/16384 ~= Cos[3*Pi/8]*Sqrt[2] = 0.541196100146197 */
/* Q14 constant promoted to Q15 by the << 1. */
*p0 = OD_MULHRS(*p0, 8867 << 1);
/* Same multiply with the constant kept in Q14 (r = 14). */
*p0 = OD_MUL(*p0, 8867, 14);
/* 21407/16384 ~= Sin[3*Pi/8]*Sqrt[2] = 1.3065629648763766 */
/* Manually expanded form: *p1 + *p1*(21407 - 16384)/16384, because the
   full constant exceeds 1.0 and would not fit a Q15 int16_t. */
*p1 = OD_ADD(*p1, OD_MULHRS(*p1, (21407 - 16384) << 1));
/* Same multiply expressed directly (r = 14). */
*p1 = OD_MUL(*p1, 21407, 14);
/* 15137/8192 ~= 2*Cos[Pi/8] = 1.8477590650225735 */
/* Manually expanded form: t_ + t_*(15137 - 8192)/8192, residual in Q15. */
t_ = OD_ADD(t_, OD_MULHRS(t_, (15137 - 8192) << 2));
/* Same multiply expressed directly (r = 13). */
t_ = OD_MUL(t_, 15137, 13);
*p0 = OD_SUB(t_, *p0);
*p1 = OD_SUB(t_, *p1);
}
......@@ -69,11 +69,11 @@ static INLINE void OD_KERNEL_FUNC(od_idst2_asym)(OD_REG *p0, OD_REG *p1) {
OD_REG u_;
t_ = OD_AVG(*p0, *p1);
/* 3135/4096 ~= (Cos[Pi/8] - Sin[Pi/8])*Sqrt[2] = 0.7653668647301795 */
u_ = OD_MULHRS(*p1, 3135 << 3);
u_ = OD_MUL(*p1, 3135, 12);
/* 15137/16384 ~= (Cos[Pi/8] + Sin[Pi/8])/Sqrt[2] = 0.9238795325112867 */
*p1 = OD_MULHRS(*p0, 15137 << 1);
*p1 = OD_MUL(*p0, 15137, 14);
/* 8867/8192 ~= Cos[3*Pi/8]*2*Sqrt[2] = 1.082392200292394 */
t_ = OD_ADD(t_, OD_MULHRS(t_, (8867 - 8192) << 2));
t_ = OD_MUL(t_, 8867, 13);
*p0 = OD_ADD(u_, t_);
*p1 = OD_SUB(*p1, OD_RSHIFT1(t_));
}
......@@ -117,15 +117,15 @@ static INLINE void OD_KERNEL_FUNC(od_idst_vii4)(OD_REG *q0, OD_REG *q1,
t3 = *q1;
t4 = OD_ADD(*q2, *q3);
/* 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779 */
t0 = OD_MULHRS(t0, 467 << 4);
t0 = OD_MUL(t0, 467, 11);
/* 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360 */
t1 = OD_MULHRS(t1, 7021 << 1);
t1 = OD_MUL(t1, 7021, 14);
/* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
t2 = OD_ADD(t2, OD_MULHRS(t2, 37837 - 32768));
t2 = OD_MUL(t2, 37837, 15);
/* 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 */
t3 = OD_ADD(t3, OD_MULHRS(t3, 37837 - 32768));
t3 = OD_MUL(t3, 37837, 15);
/* 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139 */
t4 = OD_MULHRS(t4, 21513);
t4 = OD_MUL(t4, 21513, 15);
t3h = OD_RSHIFT1(t3);
u4 = OD_ADD(t4, t3h);
*q0 = OD_ADD(t0, u4);
......@@ -151,9 +151,9 @@ static INLINE void OD_KERNEL_FUNC(od_idst4_asym)(OD_REG *q0, OD_REG *q2,
OD_REG q3h;
t_ = OD_AVG(*q1, *q2);
/* 11585/8192 ~= 2*Sin[Pi/4] = 1.4142135623730951 */
*q1 = OD_ADD(*q2, OD_MULHRS(*q2, (11585 - 8192) << 2));
*q1 = OD_MUL(*q2, 11585, 13);
/* -46341/32768 = -2*Cos[Pi/4] = -1.4142135623730951 */
*q2 = OD_SUB(OD_MULHRS(t_, 32768 - 46341), t_);
*q2 = OD_MUL(t_, -46341, 15);
*q1 = OD_ADD(*q1, *q2);
*q1 = OD_ADD(*q1, *q0);
q1h = OD_RSHIFT1(*q1);
......@@ -163,20 +163,20 @@ static INLINE void OD_KERNEL_FUNC(od_idst4_asym)(OD_REG *q0, OD_REG *q2,
*q2 = OD_ADD(*q2, q3h);
t_ = OD_ADD(q1h, *q2);
/* 45451/32768 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 */
u_ = OD_ADD(*q2, OD_MULHRS(*q2, 45451 - 32768));
u_ = OD_MUL(*q2, 45451, 15);
/* 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.27589937928294306 */
*q2 = OD_MULHRS(*q1, 9041);
*q2 = OD_MUL(*q1, 9041, 15);
/* 18205/16384 = 2*Cos[5*Pi/16] = 1.1111404660392044 */
t_ = OD_ADD(t_, OD_MULHRS(t_, (18205 - 16384) << 1));
t_ = OD_MUL(t_, 18205, 14);
*q1 = OD_SUB(OD_RSHIFT1(t_), u_);
*q2 = OD_ADD(*q2, t_);
t_ = OD_ADD(*q0, q3h);
/* 38531/32768 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 */
u_ = OD_ADD(*q0, OD_MULHRS(*q0, 38531 - 32768));
u_ = OD_MUL(*q0, 38531, 15);
/* 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 */
*q0 = OD_MULHRS(*q3, 12873 << 1);
*q0 = OD_MUL(*q3, 12873, 14);
/* 12785/32768 = 2*Cos[7*Pi/16] = 0.3901806440322565 */
t_ = OD_MULHRS(t_, 12785);
t_ = OD_MUL(t_, 12785, 15);
*q3 = OD_SUB(u_, OD_RSHIFT1(t_));
*q0 = OD_ADD(*q0, t_);
}
......@@ -209,26 +209,26 @@ static INLINE void OD_KERNEL_FUNC(od_idst8)(OD_REG *r0, OD_REG *r4, OD_REG *r2,
OD_REG r7h;
t_ = OD_AVG(*r1, *r6);
/* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */
*r1 = OD_ADD(*r6, OD_MULHRS(*r6, (11585 - 8192) << 2));
*r1 = OD_MUL(*r6, 11585, 13);
/* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */
*r6 = OD_ADD(t_, OD_MULHRS(t_, (11585 - 8192) << 2));
*r6 = OD_MUL(t_, 11585, 13);
*r1 = OD_SUB(*r1, *r6);
t_ = OD_HRSUB(*r5, *r2);
/* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */
u_ = OD_ADD(*r5, OD_MULHRS(*r5, (21407 - 16384) << 1));
u_ = OD_MUL(*r5, 21407, 14);
/* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */
*r5 = OD_MULHRS(*r2, 8867 << 1);
*r5 = OD_MUL(*r2, 8867, 14);
/* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */
t_ = OD_MULHRS(t_, 3135 << 3);
t_ = OD_MUL(t_, 3135, 12);
*r5 = OD_SUB(*r5, t_);
*r2 = OD_SUB(t_, u_);
t_ = OD_AVG(*r3, *r4);
/* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */
u_ = OD_ADD(*r4, OD_MULHRS(*r4, (21407 - 16384) << 1));
u_ = OD_MUL(*r4, 21407, 14);
/* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */
*r4 = OD_MULHRS(*r3, 8867 << 1);
*r4 = OD_MUL(*r3, 8867, 14);
/* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */
t_ = OD_MULHRS(t_, 3135 << 3);
t_ = OD_MUL(t_, 3135, 12);
*r3 = OD_SUB(u_, t_);
*r4 = OD_ADD(*r4, t_);
*r7 = OD_ADD(*r7, *r6);
......@@ -253,38 +253,38 @@ static INLINE void OD_KERNEL_FUNC(od_idst8)(OD_REG *r0, OD_REG *r4, OD_REG *r2,
*r2 = OD_ADD(*r2, *r1);
t_ = OD_ADD(*r0, *r7);
/* 17911/16384 ~= Sin[15*Pi/32] + Cos[15*Pi/32] ~= 1.0932018670017576 */
u_ = OD_ADD(*r0, OD_MULHRS(*r0, (17911 - 16384) << 1));
u_ = OD_MUL(*r0, 17911, 14);
/* 14699/16384 ~= Sin[15*Pi/32] - Cos[15*Pi/32] ~= 0.8971675863426363 */
*r0 = OD_MULHRS(*r7, 14699 << 1);
*r0 = OD_MUL(*r7, 14699, 14);
/* 803/8192 ~= Cos[15*Pi/32] ~= 0.0980171403295606 */
t_ = OD_MULHRS(t_, 803 << 2);
t_ = OD_MUL(t_, 803, 13);
*r7 = OD_SUB(u_, t_);
*r0 = OD_ADD(*r0, t_);
t_ = OD_SUB(*r1, *r6);
/* 40869/32768 ~= Sin[13*Pi/32] + Cos[13*Pi/32] ~= 1.247225012986671 */
u_ = OD_ADD(*r6, OD_MULHRS(*r6, 40869 - 32768));
u_ = OD_MUL(*r6, 40869, 15);
/* 21845/32768 ~= Sin[13*Pi/32] - Cos[13*Pi/32] ~= 0.6666556584777465 */
*r6 = OD_MULHRS(*r1, 21845);
*r6 = OD_MUL(*r1, 21845, 15);
/* 1189/4096 ~= Cos[13*Pi/32] ~= 0.29028467725446233 */
t_ = OD_MULHRS(t_, 1189 << 3);
t_ = OD_MUL(t_, 1189, 12);
*r1 = OD_ADD(u_, t_);
*r6 = OD_ADD(*r6, t_);
t_ = OD_ADD(*r2, *r5);
/* 22173/16384 ~= Sin[11*Pi/32] + Cos[11*Pi/32] ~= 1.3533180011743526 */
u_ = OD_ADD(*r2, OD_MULHRS(*r2, (22173 - 16384) << 1));
u_ = OD_MUL(*r2, 22173, 14);
/* 3363/8192 ~= Sin[11*Pi/32] - Cos[11*Pi/32] ~= 0.4105245275223574 */
*r2 = OD_MULHRS(*r5, 3363 << 2);
*r2 = OD_MUL(*r5, 3363, 13);
/* 15447/32768 ~= Cos[11*Pi/32] ~= 0.47139673682599764 */
t_ = OD_MULHRS(t_, 15447);
t_ = OD_MUL(t_, 15447, 15);
*r5 = OD_SUB(u_, t_);
*r2 = OD_ADD(*r2, t_);
t_ = OD_SUB(*r3, *r4);
/* 23059/16384 ~= Sin[9*Pi/32] + Cos[9*Pi/32] ~= 1.4074037375263826 */
u_ = OD_ADD(*r4, OD_MULHRS(*r4, (23059 - 16384) << 1));
u_ = OD_MUL(*r4, 23059, 14);
/* 2271/16384 ~= Sin[9*Pi/32] - Cos[9*Pi/32] ~= 0.1386171691990915 */
*r4 = OD_MULHRS(*r3, 2271 << 1);
*r4 = OD_MUL(*r3, 2271, 14);
/* 5197/8192 ~= Cos[9*Pi/32] ~= 0.6343932841636455 */
t_ = OD_MULHRS(t_, 5197 << 2);
t_ = OD_MUL(t_, 5197, 13);
*r3 = OD_ADD(u_, t_);
*r4 = OD_ADD(*r4, t_);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment