Commit b0191d21 authored by Timothy B. Terriberry's avatar Timothy B. Terriberry Committed by Tim Terriberry

daala_tx: Add SIMD version of the 16-point DCT

Change-Id: Ie3e599def556a90c474680567c4537508de2e30a
parent dc857d1b
This diff is collapsed.
......@@ -144,6 +144,44 @@ static INLINE void OD_KERNEL_FUNC(od_flip_idst_vii4)(OD_REG *q0, OD_REG *q1,
OD_SWAP(q1, q2);
}
static INLINE void OD_KERNEL_FUNC(od_idst4)(OD_REG *q0, OD_REG *q1, OD_REG *q2,
OD_REG *q3) {
OD_REG t_;
OD_REG u_;
OD_REG q2h;
OD_REG q3h;
t_ = OD_AVG(*q1, *q2);
/* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */
*q2 = OD_MUL(*q1, 11585, 13);
/* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */
*q1 = OD_MUL(t_, 11585, 13);
*q2 = OD_SUB(*q2, *q1);
*q2 = OD_ADD(*q2, *q0);
q2h = OD_RSHIFT1(*q2);
*q0 = OD_SUB(*q0, q2h);
*q3 = OD_ADD(*q3, *q1);
q3h = OD_RSHIFT1(*q3);
*q1 = OD_SUB(*q1, q3h);
t_ = OD_ADD(*q1, q2h);
/* 16069/16384 ~= (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] ~= 0.9807852804032 */
u_ = OD_MUL(*q2, 16069, 14);
/* 12785/32768 ~= (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] ~= 0.3901806440323 */
*q2 = OD_MUL(*q1, 12785, 15);
/* 12873/16384 ~= Cos[5*Pi/16]*Sqrt[2] ~= 0.7856949583871021 */
t_ = OD_MUL(t_, 12873, 14);
*q1 = OD_SUB(u_, t_);
*q2 = OD_ADD(*q2, t_);
t_ = OD_SUB(*q0, q3h);
/* 13623/16384 ~= (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] ~= 0.8314696123025 */
u_ = OD_MUL(*q3, 13623, 14);
/* 18205/16384 ~= (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] ~= 1.1111404660392 */
*q3 = OD_MUL(*q0, 18205, 14);
/* 9041/32768 ~= Cos[7*Pi/16]*Sqrt[2] = 0.275899379282943 */
t_ = OD_MUL(t_, 9041, 15);
*q0 = OD_ADD(u_, t_);
*q3 = OD_ADD(*q3, t_);
}
static INLINE void OD_KERNEL_FUNC(od_idst4_asym)(OD_REG *q0, OD_REG *q2,
OD_REG *q1, OD_REG *q3) {
OD_REG t_;
......@@ -199,6 +237,25 @@ static INLINE void OD_KERNEL_FUNC(od_idct8)(OD_REG *r0, OD_REG *r4, OD_REG *r2,
*r7 = OD_SUB(*r0, *r7);
}
static INLINE void OD_KERNEL_FUNC(od_idct8_asym)(
OD_REG *r0, OD_REG *r4, OD_REG *r2, OD_REG *r6, OD_REG *r1, OD_REG *r1h,
OD_REG *r5, OD_REG *r5h, OD_REG *r3, OD_REG *r3h, OD_REG *r7, OD_REG *r7h) {
OD_KERNEL_FUNC(od_idst4)(r7, r5, r6, r4);
OD_KERNEL_FUNC(od_idct4)(r0, r2, r1, r3);
*r7 = OD_SUB(*r0, *r7);
*r7h = OD_RSHIFT1(*r7);
*r0 = OD_SUB(*r0, *r7h);
*r1 = OD_SUB(*r1, *r6);
*r1h = OD_RSHIFT1(*r1);
*r6 = OD_ADD(*r6, *r1h);
*r5 = OD_SUB(*r2, *r5);
*r5h = OD_RSHIFT1(*r5);
*r2 = OD_SUB(*r2, *r5h);
*r3 = OD_SUB(*r3, *r4);
*r3h = OD_RSHIFT1(*r3);
*r4 = OD_ADD(*r4, *r3h);
}
static INLINE void OD_KERNEL_FUNC(od_idst8)(OD_REG *r0, OD_REG *r4, OD_REG *r2,
OD_REG *r6, OD_REG *r1, OD_REG *r5,
OD_REG *r3, OD_REG *r7) {
......@@ -290,6 +347,98 @@ static INLINE void OD_KERNEL_FUNC(od_idst8)(OD_REG *r0, OD_REG *r4, OD_REG *r2,
*r4 = OD_ADD(*r4, t_);
}
static INLINE void OD_KERNEL_FUNC(od_idst8_asym)(OD_REG *r0, OD_REG *r4,
OD_REG *r2, OD_REG *r6,
OD_REG *r1, OD_REG *r5,
OD_REG *r3, OD_REG *r7) {
OD_REG t_;
OD_REG u_;
OD_REG r0h;
OD_REG r2h;
OD_REG r5h;
OD_REG r7h;
t_ = OD_AVG(*r1, *r6);
/* 11585/8192 ~= 2*Sin[Pi/4] ~= 1.4142135623730951 */
*r1 = OD_MUL(*r6, 11585, 13);
/* 11585/8192 ~= 2*Cos[Pi/4] ~= 1.4142135623730951 */
*r6 = OD_MUL(t_, 11585, 13);
*r1 = OD_SUB(*r1, *r6);
t_ = OD_HRSUB(*r5, *r2);
/* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */
u_ = OD_MUL(*r5, 21407, 14);
/* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */
*r5 = OD_MUL(*r2, 8867, 14);
/* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */
t_ = OD_MUL(t_, 3135, 12);
*r5 = OD_SUB(*r5, t_);
*r2 = OD_SUB(t_, u_);
t_ = OD_AVG(*r3, *r4);
/* 21407/16384 ~= Sin[3*Pi/8] + Cos[3*Pi/8] ~= 1.3065629648763766 */
u_ = OD_MUL(*r4, 21407, 14);
/* 8867/16384 ~= Sin[3*Pi/8] - Cos[3*Pi/8] ~= 0.5411961001461969 */
*r4 = OD_MUL(*r3, 8867, 14);
/* 3135/4096 ~= 2*Cos[3*Pi/8] ~= 0.7653668647301796 */
t_ = OD_MUL(t_, 3135, 12);
*r3 = OD_SUB(u_, t_);
*r4 = OD_ADD(*r4, t_);
*r7 = OD_ADD(*r7, *r6);
r7h = OD_RSHIFT1(*r7);
*r6 = OD_SUB(*r6, r7h);
*r2 = OD_ADD(*r2, *r4);
r2h = OD_RSHIFT1(*r2);
*r4 = OD_SUB(*r4, r2h);
*r0 = OD_SUB(*r0, *r1);
r0h = OD_RSHIFT1(*r0);
*r1 = OD_ADD(*r1, r0h);
*r5 = OD_ADD(*r5, *r3);
r5h = OD_RSHIFT1(*r5);
*r3 = OD_SUB(*r3, r5h);
*r4 = OD_SUB(*r4, r7h);
*r7 = OD_ADD(*r7, *r4);
*r6 = OD_ADD(*r6, r5h);
*r5 = OD_SUB(*r5, *r6);
*r3 = OD_ADD(*r3, r0h);
*r0 = OD_SUB(*r0, *r3);
*r1 = OD_SUB(*r1, r2h);
*r2 = OD_ADD(*r2, *r1);
t_ = OD_ADD(*r0, *r7);
/* 12665/16384 ~= (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] ~= 0.77301045336 */
u_ = OD_MUL(*r0, 12665, 14);
/* 5197/4096 ~= (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] ~= 1.2687865683273 */
*r0 = OD_MUL(*r7, 5197, 12);
/* 2271/16384 ~= Cos[15*Pi/32]*Sqrt[2] ~= 0.13861716919909148 */
t_ = OD_MUL(t_, 2271, 14);
*r7 = OD_SUB(u_, OD_RSHIFT1(t_));
*r0 = OD_ADD(*r0, t_);
t_ = OD_SUB(*r1, *r6);
/* 28899/32768 ~= (Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] ~= 0.88192126435 */
u_ = OD_MUL(*r6, 28899, 15);
/* 30893/32768 ~= (Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] ~= 0.94279347365 */
*r6 = OD_MUL(*r1, 30893, 15);
/* 3363/8192 ~= Cos[13*Pi/32]*Sqrt[2] ~= 0.41052452752235735 */
t_ = OD_MUL(t_, 3363, 13);
*r1 = OD_ADD(u_, OD_RSHIFT1(t_));
*r6 = OD_ADD(*r6, t_);
t_ = OD_ADD(*r2, *r5);
/* 31357/32768 ~= (Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] ~= 0.95694033573 */
u_ = OD_MUL(*r2, 31357, 15);
/* 1189/2048 ~= (Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] ~= 0.5805693545089 */
*r2 = OD_MUL(*r5, 1189, 11);
/* 21845/32768 ~= Cos[11*Pi/32] ~= 0.6666556584777465 */
t_ = OD_MUL(t_, 21845, 15);
*r5 = OD_SUB(u_, OD_RSHIFT1(t_));
*r2 = OD_ADD(*r2, t_);
t_ = OD_SUB(*r3, *r4);
/* 16305/16384 ~= (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] ~= 0.9951847266722 */
u_ = OD_MUL(*r4, 16305, 14);
/* 803/4096 ~= (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] ~= 0.1960342806591213 */
*r4 = OD_MUL(*r3, 803, 12);
/* 14699/16384 ~= Cos[9*Pi/32]*Sqrt[2] ~= 0.8971675863426364 */
t_ = OD_MUL(t_, 14699, 14);
*r3 = OD_ADD(u_, OD_RSHIFT1(t_));
*r4 = OD_ADD(*r4, t_);
}
static INLINE void OD_KERNEL_FUNC(od_flip_idst8)(OD_REG *r0, OD_REG *r4,
OD_REG *r2, OD_REG *r6,
OD_REG *r1, OD_REG *r5,
......@@ -300,3 +449,34 @@ static INLINE void OD_KERNEL_FUNC(od_flip_idst8)(OD_REG *r0, OD_REG *r4,
OD_SWAP(r2, r5);
OD_SWAP(r6, r1);
}
static INLINE void OD_KERNEL_FUNC(od_idct16)(OD_REG *s0, OD_REG *s8, OD_REG *s4,
OD_REG *sc, OD_REG *s2, OD_REG *sa,
OD_REG *s6, OD_REG *se, OD_REG *s1,
OD_REG *s9, OD_REG *s5, OD_REG *sd,
OD_REG *s3, OD_REG *sb, OD_REG *s7,
OD_REG *sf) {
OD_REG s1h;
OD_REG s3h;
OD_REG s5h;
OD_REG s7h;
OD_KERNEL_FUNC(od_idst8_asym)(sf, sb, sd, s9, se, sa, sc, s8);
OD_KERNEL_FUNC(od_idct8_asym)
(s0, s4, s2, s6, s1, &s1h, s5, &s5h, s3, &s3h, s7, &s7h);
*s8 = OD_ADD(*s8, s7h);
*s7 = OD_SUB(*s7, *s8);
*s6 = OD_ADD(*s6, OD_RSHIFT1(*s9));
*s9 = OD_SUB(*s6, *s9);
*sa = OD_ADD(*sa, s5h);
*s5 = OD_SUB(*s5, *sa);
*s4 = OD_ADD(*s4, OD_RSHIFT1(*sb));
*sb = OD_SUB(*s4, *sb);
*sc = OD_ADD(*sc, s3h);
*s3 = OD_SUB(*s3, *sc);
*s2 = OD_ADD(*s2, OD_RSHIFT1(*sd));
*sd = OD_SUB(*s2, *sd);
*se = OD_ADD(*se, s1h);
*s1 = OD_SUB(*s1, *se);
*s0 = OD_ADD(*s0, OD_RSHIFT1(*sf));
*sf = OD_SUB(*s0, *sf);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment