From a4e245a98b9f87a202c74227caeda2c7b87e55f2 Mon Sep 17 00:00:00 2001 From: Monty Montgomery Date: Sat, 22 Jul 2017 00:48:31 -0400 Subject: [PATCH] Add CONFIG_DAALA_DCT64 experiment. This experiment replaces the 64-point Type-II DCT and related scaling vp9 transforms with the 64-point orthonormal Daala transforms. subset-1: monty-square-baseline-s1-F2@2017-07-28T03:35:45.962Z -> monty-square-dct64-s1-F2@2017-07-29T04:50:58.412Z PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000 -0.1930 | -0.2037 | -0.0643 | -0.1917 | -0.2331 | -0.3510 | -0.1810 objective-1-fast: monty-square-baseline-o1f-F2@2017-07-28T03:35:35.533Z -> monty-square-dct64-o1f-F2@2017-07-29T04:50:28.542Z PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000 -0.2557 | -0.1743 | -0.4900 | -0.3028 | -0.4147 | -0.5764 | -0.2864 Change-Id: I1f944df29e44d2e350c42555af274f2d75a62a92 --- aom_dsp/inv_txfm.c | 13 +- aom_dsp/inv_txfm.h | 3 + av1/common/daala_tx.c | 1732 +++++++++++++++++++++++++ av1/common/daala_tx.h | 5 +- av1/common/idct.c | 50 +- av1/encoder/dct.c | 56 +- build/cmake/aom_config_defaults.cmake | 1 + build/cmake/aom_configure.cmake | 10 +- configure | 7 +- 9 files changed, 1869 insertions(+), 8 deletions(-) diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c index fbf09db5f..0aa46721d 100644 --- a/aom_dsp/inv_txfm.c +++ b/aom_dsp/inv_txfm.c @@ -15,7 +15,7 @@ #include "./aom_dsp_rtcd.h" #include "aom_dsp/inv_txfm.h" #if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \ - CONFIG_DAALA_DCT32 + CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64 #include "av1/common/daala_tx.h" #endif @@ -1469,6 +1469,17 @@ void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } +#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64 +void aom_idct64_c(const tran_low_t *input, tran_low_t *output) { /* 1-D 64-point inverse transform wrapper: copies the 64 input coefficients into y as od_coeff, calls od_bin_idct64(x, 1, y), then copies the 64 values of x into output as tran_low_t. input and output are each indexed 0..63. NOTE(review): the literal 1 is presumably the output stride of od_bin_idct64 -- confirm against daala_tx.h. */ + int i; + od_coeff x[64]; + od_coeff y[64]; + for (i = 0; i < 64; i++) y[i] = (od_coeff)input[i]; + od_bin_idct64(x, 1, y); + for (i = 0; i < 64; i++) output[i] = 
(tran_low_t)x[i]; +} +#endif + void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, diff --git a/aom_dsp/inv_txfm.h b/aom_dsp/inv_txfm.h index a9c485e74..dee7599cf 100644 --- a/aom_dsp/inv_txfm.h +++ b/aom_dsp/inv_txfm.h @@ -68,6 +68,9 @@ void aom_idct4_c(const tran_low_t *input, tran_low_t *output); void aom_idct8_c(const tran_low_t *input, tran_low_t *output); void aom_idct16_c(const tran_low_t *input, tran_low_t *output); void aom_idct32_c(const tran_low_t *input, tran_low_t *output); +#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64 +void aom_idct64_c(const tran_low_t *input, tran_low_t *output); +#endif void aom_iadst4_c(const tran_low_t *input, tran_low_t *output); void aom_iadst8_c(const tran_low_t *input, tran_low_t *output); void aom_iadst16_c(const tran_low_t *input, tran_low_t *output); diff --git a/av1/common/daala_tx.c b/av1/common/daala_tx.c index 82c8af549..72e9ebca7 100644 --- a/av1/common/daala_tx.c +++ b/av1/common/daala_tx.c @@ -1788,6 +1788,1336 @@ } \ while (0) +#if CONFIG_TX64X64 +#define OD_FDCT_32_ASYM(t0, tg, tgh, t8, to, toh, t4, tk, tkh, tc, ts, tsh, \ + t2, ti, tih, ta, tq, tqh, t6, tm, tmh, te, tu, tuh, t1, th, thh, \ + t9, tp, tph, t5, tl, tlh, td, tt, tth, t3, tj, tjh, tb, tr, trh, \ + t7, tn, tnh, tf, tv, tvh) \ + /* Embedded 32-point asymmetric Type-II fDCT. 
*/ \ + do { /* Butterflies on the 16 mirrored pairs (t0,tv), (t1,tu), ..., (tf,tg); each one reads the caller-supplied *h argument belonging to the second member of its pair (presumably that value halved -- confirm at the call site). Afterwards t0,tg,t8,...,tu feed OD_FDCT_16 and tv,tf,tn,...,t1 feed OD_FDST_16. All arguments are modified in place, so they must be plain lvalues. */ \ + t0 += tvh; \ + tv = t0 - tv; \ + t1 = tuh - t1; \ + tu -= t1; \ + t2 += tth; \ + tt = t2 - tt; \ + t3 = tsh - t3; \ + ts -= t3; \ + t4 += trh; \ + tr = t4 - tr; \ + t5 = tqh - t5; \ + tq -= t5; \ + t6 += tph; \ + tp = t6 - tp; \ + t7 = toh - t7; \ + to -= t7; \ + t8 += tnh; \ + tn = t8 - tn; \ + t9 = tmh - t9; \ + tm -= t9; \ + ta += tlh; \ + tl = ta - tl; \ + tb = tkh - tb; \ + tk -= tb; \ + tc += tjh; \ + tj = tc - tj; \ + td = tih - td; \ + ti -= td; \ + te += thh; \ + th = te - th; \ + tf = tgh - tf; \ + tg -= tf; \ + OD_FDCT_16(t0, tg, t8, to, t4, tk, tc, ts, \ + t2, ti, ta, tq, t6, tm, te, tu); \ + OD_FDST_16(tv, tf, tn, t7, tr, tb, tj, t3, \ + tt, td, tl, t5, tp, t9, th, t1); \ + } \ + while (0) + +#define OD_IDCT_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, \ + t6, tm, te, tu, t1, t1h, th, thh, t9, t9h, tp, tph, t5, t5h, tl, tlh, \ + td, tdh, tt, tth, t3, t3h, tj, tjh, tb, tbh, tr, trh, t7, t7h, tn, tnh, \ + tf, tfh, tv, tvh) \ + /* Embedded 32-point asymmetric Type-II iDCT. 
*/ \ + do { /* Applies OD_IDST_16 to tv,tn,...,tg and OD_IDCT_16 to t0,t8,...,tf, then recombines the mirrored pairs (t0,tv), (t1,tu), ..., (tf,tg) with butterflies. Every *h parameter is an output: it is assigned OD_DCT_RSHIFT(x, 1) of its namesake inside that pair's butterfly. All arguments are modified in place, so they must be plain lvalues. */ \ + OD_IDST_16(tv, tn, tr, tj, tt, tl, tp, th, \ + tu, tm, tq, ti, ts, tk, to, tg); \ + OD_IDCT_16(t0, t8, t4, tc, t2, ta, t6, te, \ + t1, t9, t5, td, t3, tb, t7, tf); \ + tv = t0 - tv; \ + tvh = OD_DCT_RSHIFT(tv, 1); \ + t0 -= tvh; \ + t1 += tu; \ + t1h = OD_DCT_RSHIFT(t1, 1); \ + tu = t1h - tu; \ + tt = t2 - tt; \ + tth = OD_DCT_RSHIFT(tt, 1); \ + t2 -= tth; \ + t3 += ts; \ + t3h = OD_DCT_RSHIFT(t3, 1); \ + ts = t3h - ts; \ + tr = t4 - tr; \ + trh = OD_DCT_RSHIFT(tr, 1); \ + t4 -= trh; \ + t5 += tq; \ + t5h = OD_DCT_RSHIFT(t5, 1); \ + tq = t5h - tq; \ + tp = t6 - tp; \ + tph = OD_DCT_RSHIFT(tp, 1); \ + t6 -= tph; \ + t7 += to; \ + t7h = OD_DCT_RSHIFT(t7, 1); \ + to = t7h - to; \ + tn = t8 - tn; \ + tnh = OD_DCT_RSHIFT(tn, 1); \ + t8 -= tnh; \ + t9 += tm; \ + t9h = OD_DCT_RSHIFT(t9, 1); \ + tm = t9h - tm; \ + tl = ta - tl; \ + tlh = OD_DCT_RSHIFT(tl, 1); \ + ta -= tlh; \ + tb += tk; \ + tbh = OD_DCT_RSHIFT(tb, 1); \ + tk = tbh - tk; \ + tj = tc - tj; \ + tjh = OD_DCT_RSHIFT(tj, 1); \ + tc -= tjh; \ + td += ti; \ + tdh = OD_DCT_RSHIFT(td, 1); \ + ti = tdh - ti; \ + th = te - th; \ + thh = OD_DCT_RSHIFT(th, 1); \ + te -= thh; \ + tf += tg; \ + tfh = OD_DCT_RSHIFT(tf, 1); \ + tg = tfh - tg; \ + } \ + while (0) + +#define OD_FDST_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \ + tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \ + /* Embedded 32-point asymmetric Type-IV fDST. 
*/ \ + do { /* All 32 arguments are modified in place, so they must be plain lvalues. The eight *h locals declared next are scratch values: each is assigned OD_DCT_RSHIFT(x, 1) of its namesake in one butterfly stage and consumed by a later butterfly stage. Every multiply in this macro is preceded by an OD_DCT_OVERFLOW_CHECK naming the same operand, constant and rounding offset, plus a check id (271..387 here). */ \ + int t0h; \ + int t1h; \ + int t4h; \ + int t5h; \ + int tqh; \ + int trh; \ + int tuh; \ + int tvh; \ + \ + tu = -tu; \ + \ + /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ + OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 271); \ + t5 -= (tq*13573 + 8192) >> 14; \ + /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \ + OD_DCT_OVERFLOW_CHECK(t5, 11585, 16384, 272); \ + tq += (t5*11585 + 16384) >> 15; \ + /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ + OD_DCT_OVERFLOW_CHECK(tq, 13573, 8192, 273); \ + t5 -= (tq*13573 + 8192) >> 14; \ + /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ + OD_DCT_OVERFLOW_CHECK(t6, 29957, 16384, 274); \ + tp += (t6*29957 + 16384) >> 15; \ + /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ + OD_DCT_OVERFLOW_CHECK(tp, 11585, 8192, 275); \ + t6 -= (tp*11585 + 8192) >> 14; \ + /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ + OD_DCT_OVERFLOW_CHECK(t6, 19195, 16384, 276); \ + tp -= (t6*19195 + 16384) >> 15; \ + /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ + OD_DCT_OVERFLOW_CHECK(t1, 29957, 16384, 277); \ + tu += (t1*29957 + 16384) >> 15; \ + /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ + OD_DCT_OVERFLOW_CHECK(tu, 11585, 8192, 278); \ + t1 -= (tu*11585 + 8192) >> 14; \ + /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ + OD_DCT_OVERFLOW_CHECK(t1, 19195, 16384, 279); \ + tu -= (t1*19195 + 16384) >> 15; \ + /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ + OD_DCT_OVERFLOW_CHECK(t2, 28681, 16384, 280); \ + tt += (t2*28681 + 16384) >> 15; \ + /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ + OD_DCT_OVERFLOW_CHECK(tt, 15137, 8192, 281); \ + t2 -= (tt*15137 + 8192) >> 14; \ + /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ + OD_DCT_OVERFLOW_CHECK(t2, 4161, 8192, 282); \ + tt += (t2*4161 + 8192) >> 14; \ + /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ + 
OD_DCT_OVERFLOW_CHECK(ts, 4161, 8192, 283); \ + t3 += (ts*4161 + 8192) >> 14; \ + /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ + OD_DCT_OVERFLOW_CHECK(t3, 15137, 8192, 284); \ + ts -= (t3*15137 + 8192) >> 14; \ + /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ + OD_DCT_OVERFLOW_CHECK(ts, 14341, 8192, 285); \ + t3 += (ts*14341 + 8192) >> 14; \ + /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ + OD_DCT_OVERFLOW_CHECK(tm, 19195, 16384, 286); \ + t9 -= (tm*19195 + 16384) >> 15; \ + /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ + OD_DCT_OVERFLOW_CHECK(t9, 11585, 8192, 287); \ + tm -= (t9*11585 + 8192) >> 14; \ + /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ + OD_DCT_OVERFLOW_CHECK(tm, 7489, 4096, 288); \ + t9 += (tm*7489 + 4096) >> 13; \ + /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \ + OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 289); \ + ta += (tl*3259 + 4096) >> 13; \ + /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \ + OD_DCT_OVERFLOW_CHECK(ta, 3135, 8192, 290); \ + tl -= (ta*3135 + 8192) >> 14; \ + /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \ + OD_DCT_OVERFLOW_CHECK(tl, 3259, 4096, 291); \ + ta += (tl*3259 + 4096) >> 13; \ + /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ + OD_DCT_OVERFLOW_CHECK(tk, 4161, 8192, 292); \ + tb += (tk*4161 + 8192) >> 14; \ + /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ + OD_DCT_OVERFLOW_CHECK(tb, 15137, 8192, 293); \ + tk -= (tb*15137 + 8192) >> 14; \ + /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ + OD_DCT_OVERFLOW_CHECK(tk, 14341, 8192, 294); \ + tb += (tk*14341 + 8192) >> 14; \ + /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ + OD_DCT_OVERFLOW_CHECK(te, 29957, 16384, 295); \ + th += (te*29957 + 16384) >> 15; \ + /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ + OD_DCT_OVERFLOW_CHECK(th, 11585, 8192, 296); \ + te -= (th*11585 + 8192) >> 14; \ + /* 
-19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ + OD_DCT_OVERFLOW_CHECK(te, 19195, 16384, 297); \ + th -= (te*19195 + 16384) >> 15; \ + /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ + OD_DCT_OVERFLOW_CHECK(tc, 28681, 16384, 298); \ + tj += (tc*28681 + 16384) >> 15; \ + /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ + OD_DCT_OVERFLOW_CHECK(tj, 15137, 8192, 299); \ + tc -= (tj*15137 + 8192) >> 14; \ + /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ + OD_DCT_OVERFLOW_CHECK(tc, 4161, 8192, 300); \ + tj += (tc*4161 + 8192) >> 14; \ + /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ + OD_DCT_OVERFLOW_CHECK(ti, 4161, 8192, 301); \ + td += (ti*4161 + 8192) >> 14; \ + /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ + OD_DCT_OVERFLOW_CHECK(td, 15137, 8192, 302); \ + ti -= (td*15137 + 8192) >> 14; \ + /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ + OD_DCT_OVERFLOW_CHECK(ti, 14341, 8192, 303); \ + td += (ti*14341 + 8192) >> 14; \ + \ + t1 = -t1; \ + t2 = -t2; \ + t3 = -t3; \ + td = -td; \ + tg = -tg; \ + to = -to; \ + ts = -ts; \ + \ + tr -= OD_DCT_RSHIFT(t5, 1); \ + t5 += tr; \ + tq -= OD_DCT_RSHIFT(t4, 1); /* pass */ \ + t4 += tq; \ + t6 -= OD_DCT_RSHIFT(t7, 1); \ + t7 += t6; \ + to -= OD_DCT_RSHIFT(tp, 1); /* pass */ \ + tp += to; \ + t1 += OD_DCT_RSHIFT(t0, 1); /* pass */ \ + t0 -= t1; \ + tv -= OD_DCT_RSHIFT(tu, 1); \ + tu += tv; \ + t3 -= OD_DCT_RSHIFT(tt, 1); \ + tt += t3; \ + t2 += OD_DCT_RSHIFT(ts, 1); \ + ts -= t2; \ + t9 -= OD_DCT_RSHIFT(t8, 1); /* pass */ \ + t8 += t9; \ + tn += OD_DCT_RSHIFT(tm, 1); \ + tm -= tn; \ + tb += OD_DCT_RSHIFT(ta, 1); \ + ta -= tb; \ + tl -= OD_DCT_RSHIFT(tk, 1); \ + tk += tl; \ + te -= OD_DCT_RSHIFT(tf, 1); /* pass */ \ + tf += te; \ + tg -= OD_DCT_RSHIFT(th, 1); \ + th += tg; \ + tc -= OD_DCT_RSHIFT(ti, 1); \ + ti += tc; \ + td += OD_DCT_RSHIFT(tj, 1); \ + tj -= td; \ + \ + t4 = -t4; \ + \ + /* 6723/8192 ~= 
Tan[7*Pi/32] ~= 0.8206787908286602 */ \ + OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 304); \ + t4 += (tr*6723 + 4096) >> 13; \ + /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \ + OD_DCT_OVERFLOW_CHECK(t4, 16069, 8192, 305); \ + tr -= (t4*16069 + 8192) >> 14; \ + /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \ + OD_DCT_OVERFLOW_CHECK(tr, 6723, 4096, 306); \ + t4 += (tr*6723 + 4096) >> 13; \ + /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \ + OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 307); \ + t5 += (tq*17515 + 16384) >> 15; \ + /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \ + OD_DCT_OVERFLOW_CHECK(t5, 13623, 8192, 308); \ + tq -= (t5*13623 + 8192) >> 14; \ + /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \ + OD_DCT_OVERFLOW_CHECK(tq, 17515, 16384, 309); \ + t5 += (tq*17515 + 16384) >> 15; \ + /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ + OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 310); \ + t7 += (to*3227 + 16384) >> 15; \ + /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \ + OD_DCT_OVERFLOW_CHECK(t7, 6393, 16384, 311); \ + to -= (t7*6393 + 16384) >> 15; \ + /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ + OD_DCT_OVERFLOW_CHECK(to, 3227, 16384, 312); \ + t7 += (to*3227 + 16384) >> 15; \ + /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ + OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 313); \ + t6 += (tp*2485 + 4096) >> 13; \ + /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \ + OD_DCT_OVERFLOW_CHECK(t6, 18205, 16384, 314); \ + tp -= (t6*18205 + 16384) >> 15; \ + /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ + OD_DCT_OVERFLOW_CHECK(tp, 2485, 4096, 315); \ + t6 += (tp*2485 + 4096) >> 13; \ + \ + t5 = -t5; \ + \ + tr += to; \ + trh = OD_DCT_RSHIFT(tr, 1); \ + to -= trh; \ + t4 += t7; \ + t4h = OD_DCT_RSHIFT(t4, 1); \ + t7 -= t4h; \ + t5 += tp; \ + t5h = OD_DCT_RSHIFT(t5, 1); \ + tp -= t5h; \ + tq += t6; \ + tqh = OD_DCT_RSHIFT(tq, 1); \ + t6 -= tqh; \ + t0 -= t3; \ + t0h = OD_DCT_RSHIFT(t0, 
1); \ + t3 += t0h; \ + tv -= ts; \ + tvh = OD_DCT_RSHIFT(tv, 1); \ + ts += tvh; \ + tu += tt; \ + tuh = OD_DCT_RSHIFT(tu, 1); \ + tt -= tuh; \ + t1 -= t2; \ + t1h = OD_DCT_RSHIFT(t1, 1); \ + t2 += t1h; \ + t8 += tb; \ + tb -= OD_DCT_RSHIFT(t8, 1); \ + tn += tk; \ + tk -= OD_DCT_RSHIFT(tn, 1); \ + t9 += tl; \ + tl -= OD_DCT_RSHIFT(t9, 1); \ + tm -= ta; \ + ta += OD_DCT_RSHIFT(tm, 1); \ + tc -= tf; \ + tf += OD_DCT_RSHIFT(tc, 1); \ + tj += tg; \ + tg -= OD_DCT_RSHIFT(tj, 1); \ + td -= te; \ + te += OD_DCT_RSHIFT(td, 1); \ + ti += th; \ + th -= OD_DCT_RSHIFT(ti, 1); \ + \ + t9 = -t9; \ + tl = -tl; \ + \ + /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ + OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 316); \ + t8 += (tn*805 + 8192) >> 14; \ + /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \ + OD_DCT_OVERFLOW_CHECK(t8, 803, 4096, 317); \ + tn -= (t8*803 + 4096) >> 13; \ + /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ + OD_DCT_OVERFLOW_CHECK(tn, 805, 8192, 318); \ + t8 += (tn*805 + 8192) >> 14; \ + /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ + OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 319); \ + tk += (tb*11725 + 16384) >> 15; \ + /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \ + OD_DCT_OVERFLOW_CHECK(tk, 5197, 4096, 320); \ + tb -= (tk*5197 + 4096) >> 13; \ + /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ + OD_DCT_OVERFLOW_CHECK(tb, 11725, 16384, 321); \ + tk += (tb*11725 + 16384) >> 15; \ + /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \ + OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 322); \ + ta += (tl*2455 + 2048) >> 12; \ + /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \ + OD_DCT_OVERFLOW_CHECK(ta, 14449, 8192, 323); \ + tl -= (ta*14449 + 8192) >> 14; \ + /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \ + OD_DCT_OVERFLOW_CHECK(tl, 2455, 2048, 324); \ + ta += (tl*2455 + 2048) >> 12; \ + /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ + OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 325); \ + t9 += 
(tm*4861 + 16384) >> 15; \ + /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \ + OD_DCT_OVERFLOW_CHECK(t9, 1189, 2048, 326); \ + tm -= (t9*1189 + 2048) >> 12; \ + /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ + OD_DCT_OVERFLOW_CHECK(tm, 4861, 16384, 327); \ + t9 += (tm*4861 + 16384) >> 15; \ + /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ + OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 328); \ + tf += (tg*805 + 8192) >> 14; \ + /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \ + OD_DCT_OVERFLOW_CHECK(tf, 803, 4096, 329); \ + tg -= (tf*803 + 4096) >> 13; \ + /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ + OD_DCT_OVERFLOW_CHECK(tg, 805, 8192, 330); \ + tf += (tg*805 + 8192) >> 14; \ + /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ + OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 331); \ + tc += (tj*2931 + 4096) >> 13; \ + /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \ + OD_DCT_OVERFLOW_CHECK(tc, 5197, 4096, 332); \ + tj -= (tc*5197 + 4096) >> 13; \ + /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ + OD_DCT_OVERFLOW_CHECK(tj, 2931, 4096, 333); \ + tc += (tj*2931 + 4096) >> 13; \ + /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \ + OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 334); \ + td += (ti*513 + 1024) >> 11; \ + /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \ + OD_DCT_OVERFLOW_CHECK(td, 7723, 8192, 335); \ + ti -= (td*7723 + 8192) >> 14; \ + /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \ + OD_DCT_OVERFLOW_CHECK(ti, 513, 1024, 336); \ + td += (ti*513 + 1024) >> 11; \ + /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ + OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 337); \ + te += (th*4861 + 16384) >> 15; \ + /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \ + OD_DCT_OVERFLOW_CHECK(te, 1189, 2048, 338); \ + th -= (te*1189 + 2048) >> 12; \ + /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ + OD_DCT_OVERFLOW_CHECK(th, 4861, 16384, 339); \ + te += (th*4861 + 16384) >> 15; \ + \ + ta = 
-ta; \ + tb = -tb; \ + \ + tt += t5h; \ + t5 -= tt; \ + t2 -= tqh; \ + tq += t2; \ + tp += t1h; \ + t1 -= tp; \ + t6 -= tuh; \ + tu += t6; \ + t7 += tvh; \ + tv -= t7; \ + to += t0h; \ + t0 -= to; \ + t3 -= t4h; \ + t4 += t3; \ + ts += trh; \ + tr -= ts; \ + tf -= OD_DCT_RSHIFT(tn, 1); \ + tn += tf; \ + tg -= OD_DCT_RSHIFT(t8, 1); \ + t8 += tg; \ + tk += OD_DCT_RSHIFT(tc, 1); \ + tc -= tk; \ + tb += OD_DCT_RSHIFT(tj, 1); \ + tj -= tb; \ + ta += OD_DCT_RSHIFT(ti, 1); \ + ti -= ta; \ + tl += OD_DCT_RSHIFT(td, 1); \ + td -= tl; \ + te -= OD_DCT_RSHIFT(tm, 1); \ + tm += te; \ + th -= OD_DCT_RSHIFT(t9, 1); \ + t9 += th; \ + ta -= t5; \ + t5 += OD_DCT_RSHIFT(ta, 1); \ + tq -= tl; \ + tl += OD_DCT_RSHIFT(tq, 1); \ + t2 -= ti; \ + ti += OD_DCT_RSHIFT(t2, 1); \ + td -= tt; \ + tt += OD_DCT_RSHIFT(td, 1); \ + tm += tp; \ + tp -= OD_DCT_RSHIFT(tm, 1); \ + t6 += t9; \ + t9 -= OD_DCT_RSHIFT(t6, 1); \ + te -= tu; \ + tu += OD_DCT_RSHIFT(te, 1); \ + t1 -= th; \ + th += OD_DCT_RSHIFT(t1, 1); \ + t0 -= tg; \ + tg += OD_DCT_RSHIFT(t0, 1); \ + tf += tv; \ + tv -= OD_DCT_RSHIFT(tf, 1); \ + t8 -= t7; \ + t7 += OD_DCT_RSHIFT(t8, 1); \ + to -= tn; \ + tn += OD_DCT_RSHIFT(to, 1); \ + t4 -= tk; \ + tk += OD_DCT_RSHIFT(t4, 1); \ + tb -= tr; \ + tr += OD_DCT_RSHIFT(tb, 1); \ + t3 -= tj; \ + tj += OD_DCT_RSHIFT(t3, 1); \ + tc -= ts; \ + ts += OD_DCT_RSHIFT(tc, 1); \ + \ + tr = -tr; \ + ts = -ts; \ + tt = -tt; \ + tu = -tu; \ + \ + /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t0, 2847, 2048, 340); \ + tv += (t0*2847 + 2048) >> 12; \ + /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tv, 5791, 2048, 341); \ + t0 -= (tv*5791 + 2048) >> 12; \ + /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t0, 5593, 4096, 342); \ + tv += (t0*5593 + 4096) >> 13; \ + /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tf, 4099, 4096, 343); \ + tg -= (tf*4099 + 4096) >> 
13; \ + /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tg, 1997, 1024, 344); \ + tf += (tg*1997 + 1024) >> 11; \ + /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tf, 815, 16384, 345); \ + tg += (tf*815 + 16384) >> 15; \ + /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t8, 2527, 2048, 346); \ + tn -= (t8*2527 + 2048) >> 12; \ + /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tn, 4695, 4096, 347); \ + t8 += (tn*4695 + 4096) >> 13; \ + /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t8, 4187, 4096, 348); \ + tn += (t8*4187 + 4096) >> 13; \ + /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(to, 5477, 4096, 349); \ + t7 += (to*5477 + 4096) >> 13; \ + /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t7, 4169, 4096, 350); \ + to -= (t7*4169 + 4096) >> 13; \ + /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(to, 2571, 2048, 351); \ + t7 -= (to*2571 + 2048) >> 12; \ + /* 5331/8192 ~= (1/Sqrt[2] - Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t2, 5331, 4096, 352); \ + tt += (t2*5331 + 4096) >> 13; \ + /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tt, 5749, 2048, 353); \ + t2 -= (tt*5749 + 2048) >> 12; \ + /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t2, 2413, 2048, 354); \ + tt += (t2*2413 + 2048) >> 12; \ + /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(td, 4167, 4096, 355); \ + ti -= (td*4167 + 4096) >> 13; \ + /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(ti, 891, 512, 356); \ + td += (ti*891 + 512) >> 10; \ + /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(td, 4327, 16384, 357); \ + ti += (td*4327 + 16384) >> 15; \ + 
/* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(ta, 2261, 2048, 358); \ + tl -= (ta*2261 + 2048) >> 12; \ + /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tl, 2855, 2048, 359); \ + ta += (tl*2855 + 2048) >> 12; \ + /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(ta, 5417, 8192, 360); \ + tl += (ta*5417 + 8192) >> 14; \ + /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tq, 3459, 2048, 361); \ + t5 += (tq*3459 + 2048) >> 12; \ + /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t5, 1545, 2048, 362); \ + tq -= (t5*1545 + 2048) >> 12; \ + /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tq, 1971, 1024, 363); \ + t5 -= (tq*1971 + 1024) >> 11; \ + /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t3, 323, 256, 364); \ + ts += (t3*323 + 256) >> 9; \ + /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(ts, 5707, 2048, 365); \ + t3 -= (ts*5707 + 2048) >> 12; \ + /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t3, 2229, 2048, 366); \ + ts += (t3*2229 + 2048) >> 12; \ + /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tc, 1061, 1024, 367); \ + tj -= (tc*1061 + 1024) >> 11; \ + /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tj, 6671, 4096, 368); \ + tc += (tj*6671 + 4096) >> 13; \ + /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tc, 6287, 16384, 369); \ + tj += (tc*6287 + 16384) >> 15; \ + /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tb, 4359, 4096, 370); \ + tk -= (tb*4359 + 4096) >> 13; \ + /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tk, 3099, 2048, 371); \ + tb += (tk*3099 + 2048) >> 12; \ + /* 
-2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tb, 2109, 4096, 372); \ + tk += (tb*2109 + 4096) >> 13; \ + /* 5017/8192 ~= (1/Sqrt[2] - Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t4, 5017, 4096, 373); \ + tr += (t4*5017 + 4096) >> 13; \ + /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tr, 1413, 512, 374); \ + t4 -= (tr*1413 + 512) >> 10; \ + /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t4, 8195, 8192, 375); \ + tr += (t4*8195 + 8192) >> 14; \ + /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tm, 2373, 2048, 376); \ + t9 += (tm*2373 + 2048) >> 12; \ + /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t9, 5209, 4096, 377); \ + tm -= (t9*5209 + 4096) >> 13; \ + /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tm, 3391, 4096, 378); \ + t9 -= (tm*3391 + 4096) >> 13; \ + /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t6, 1517, 1024, 379); \ + tp -= (t6*1517 + 1024) >> 11; \ + /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tp, 1817, 2048, 380); \ + t6 += (tp*1817 + 2048) >> 12; \ + /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t6, 6331, 4096, 381); \ + tp += (t6*6331 + 4096) >> 13; \ + /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(te, 515, 512, 382); \ + th -= (te*515 + 512) >> 10; \ + /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(th, 7567, 4096, 383); \ + te += (th*7567 + 4096) >> 13; \ + /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(te, 2513, 16384, 384); \ + th += (te*2513 + 16384) >> 15; \ + /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t1, 2753, 2048, 385); \ + tu += (t1*2753 + 2048) 
>> 12; \ + /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(tu, 5777, 2048, 386); \ + t1 -= (tu*5777 + 2048) >> 12; \ + /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \ + OD_DCT_OVERFLOW_CHECK(t1, 1301, 1024, 387); \ + tu += (t1*1301 + 1024) >> 11; \ + } \ + while (0) + +#define OD_IDST_32_ASYM(t0, tg, t8, to, t4, tk, tc, ts, t2, ti, ta, tq, t6, \ + tm, te, tu, t1, th, t9, tp, t5, tl, td, tt, t3, tj, tb, tr, t7, tn, tf, tv) \ + /* Embedded 32-point asymmetric Type-IV iDST. */ \ + do { \ + int t0h; \ + int t4h; \ + int tbh; \ + int tfh; \ + int tgh; \ + int tkh; \ + int trh; \ + int tvh; \ + /* 1301/2048 ~= (1/Sqrt[2] - Cos[61*Pi/128])/Sin[61*Pi/128] */ \ + tf -= (tg*1301 + 1024) >> 11; \ + /* 5777/4096 ~= Sqrt[2]*Sin[61*Pi/128] */ \ + tg += (tf*5777 + 2048) >> 12; \ + /* 2753/4096 ~= (1/Sqrt[2] - Cos[61*Pi/128]/2)/Sin[61*Pi/128] */ \ + tf -= (tg*2753 + 2048) >> 12; \ + /* -2513/32768 ~= (1/Sqrt[2] - Cos[29*Pi/128])/Sin[29*Pi/128] */ \ + th -= (te*2513 + 16384) >> 15; \ + /* 7567/8192 ~= Sqrt[2]*Sin[29*Pi/128] */ \ + te -= (th*7567 + 4096) >> 13; \ + /* 515/1024 ~= (1/Sqrt[2] - Cos[29*Pi/128]/2)/Sin[29*Pi/128] */ \ + th += (te*515 + 512) >> 10; \ + /* -6331/8192 ~= (1/Sqrt[2] - Cos[13*Pi/128])/Sin[13*Pi/128] */ \ + tj -= (tc*6331 + 4096) >> 13; \ + /* 1817/4096 ~= Sqrt[2]*Sin[13*Pi/128] */ \ + tc -= (tj*1817 + 2048) >> 12; \ + /* 1517/2048 ~= (1/Sqrt[2] - Cos[13*Pi/128]/2)/Sin[13*Pi/128] */ \ + tj += (tc*1517 + 1024) >> 11; \ + /* -3391/8192 ~= (1/Sqrt[2] - Cos[19*Pi/128])/Sin[19*Pi/128] */ \ + ti += (td*3391 + 4096) >> 13; \ + /* 5209/8192 ~= Sqrt[2]*Sin[19*Pi/128] */ \ + td += (ti*5209 + 4096) >> 13; \ + /* 2373/4096 ~= (1/Sqrt[2] - Cos[19*Pi/128]/2)/Sin[19*Pi/128] */ \ + ti -= (td*2373 + 2048) >> 12; \ + /* 8195/16384 ~= (1/Sqrt[2] - Cos[55*Pi/128])/Sin[55*Pi/128] */ \ + tr -= (t4*8195 + 8192) >> 14; \ + /* 1413/1024 ~= Sqrt[2]*Sin[55*Pi/128] */ \ + t4 += (tr*1413 + 512) >> 10; \ + /* 5017/8192 ~= (1/Sqrt[2] - 
Cos[55*Pi/128]/2)/Sin[55*Pi/128] */ \ + tr -= (t4*5017 + 4096) >> 13; \ + /* -2109/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128])/Sin[23*Pi/128] */ \ + t5 -= (tq*2109 + 4096) >> 13; \ + /* 3099/4096 ~= Sqrt[2]*Sin[23*Pi/128] */ \ + tq -= (t5*3099 + 2048) >> 12; \ + /* 4359/8192 ~= (1/Sqrt[2] - Cos[23*Pi/128]/2)/Sin[23*Pi/128] */ \ + t5 += (tq*4359 + 4096) >> 13; \ + /* -6287/32768 ~= (1/Sqrt[2] - Cos[25*Pi/128])/Sin[25*Pi/128] */ \ + tp -= (t6*6287 + 16384) >> 15; \ + /* 6671/8192 ~= Sqrt[2]*Sin[25*Pi/128] */ \ + t6 -= (tp*6671 + 4096) >> 13; \ + /* 1061/2048 ~= (1/Sqrt[2] - Cos[25*Pi/128]/2)/Sin[25*Pi/128] */ \ + tp += (t6*1061 + 1024) >> 11; \ + /* 2229/4096 ~= (1/Sqrt[2] - Cos[57*Pi/128])/Sin[57*Pi/128] */ \ + t7 -= (to*2229 + 2048) >> 12; \ + /* 5707/4096 ~= Sqrt[2]*Sin[57*Pi/128] */ \ + to += (t7*5707 + 2048) >> 12; \ + /* 323/512 ~= (1/Sqrt[2] - Cos[57*Pi/128]/2)/Sin[57*Pi/128] */ \ + t7 -= (to*323 + 256) >> 9; \ + /* -1971/2048 ~= (1/Sqrt[2] - Cos[11*Pi/128])/Sin[11*Pi/128] */ \ + tk += (tb*1971 + 1024) >> 11; \ + /* 1545/4096 ~= Sqrt[2]*Sin[11*Pi/128] */ \ + tb += (tk*1545 + 2048) >> 12; \ + /* 3459/4096 ~= (1/Sqrt[2] - Cos[11*Pi/128]/2)/Sin[11*Pi/128] */ \ + tk -= (tb*3459 + 2048) >> 12; \ + /* -5417/16384 ~= (1/Sqrt[2] - Cos[21*Pi/128])/Sin[21*Pi/128] */ \ + tl -= (ta*5417 + 8192) >> 14; \ + /* 2855/4096 ~= Sqrt[2]*Sin[21*Pi/128] */ \ + ta -= (tl*2855 + 2048) >> 12; \ + /* 2261/4096 ~= (1/Sqrt[2] - Cos[21*Pi/128]/2)/Sin[21*Pi/128] */ \ + tl += (ta*2261 + 2048) >> 12; \ + /* -4327/32768 ~= (1/Sqrt[2] - Cos[27*Pi/128])/Sin[27*Pi/128] */ \ + t9 -= (tm*4327 + 16384) >> 15; \ + /* 891/1024 ~= Sqrt[2]*Sin[27*Pi/128] */ \ + tm -= (t9*891 + 512) >> 10; \ + /* 4167/8192 ~= (1/Sqrt[2] - Cos[27*Pi/128]/2)/Sin[27*Pi/128] */ \ + t9 += (tm*4167 + 4096) >> 13; \ + /* 2413/4096 ~= (1/Sqrt[2] - Cos[59*Pi/128])/Sin[59*Pi/128] */ \ + tn -= (t8*2413 + 2048) >> 12; \ + /* 5749/4096 ~= Sqrt[2]*Sin[59*Pi/128] */ \ + t8 += (tn*5749 + 2048) >> 12; \ + /* 5331/8192 ~= (1/Sqrt[2] - 
Cos[59*Pi/128]/2)/Sin[59*Pi/128] */ \ + tn -= (t8*5331 + 4096) >> 13; \ + /* -2571/4096 ~= (1/Sqrt[2] - Cos[15*Pi/128])/Sin[15*Pi/128] */ \ + ts += (t3*2571 + 2048) >> 12; \ + /* 4169/8192 ~= Sqrt[2]*Sin[15*Pi/128] */ \ + t3 += (ts*4169 + 4096) >> 13; \ + /* 5477/8192 ~= (1/Sqrt[2] - Cos[15*Pi/128]/2)/Sin[15*Pi/128] */ \ + ts -= (t3*5477 + 4096) >> 13; \ + /* -4187/8192 ~= (1/Sqrt[2] - Cos[17*Pi/128])/Sin[17*Pi/128] */ \ + tt -= (t2*4187 + 4096) >> 13; \ + /* 4695/8192 ~= Sqrt[2]*Sin[17*Pi/128] */ \ + t2 -= (tt*4695 + 4096) >> 13; \ + /* 2527/4096 ~= (1/Sqrt[2] - Cos[17*Pi/128]/2)/Sin[17*Pi/128] */ \ + tt += (t2*2527 + 2048) >> 12; \ + /* -815/32768 ~= (1/Sqrt[2] - Cos[31*Pi/128])/Sin[31*Pi/128] */ \ + t1 -= (tu*815 + 16384) >> 15; \ + /* 1997/2048 ~= Sqrt[2]*Sin[31*Pi/128] */ \ + tu -= (t1*1997 + 1024) >> 11; \ + /* 4099/8192 ~= (1/Sqrt[2] - Cos[31*Pi/128]/2)/Sin[31*Pi/128] */ \ + t1 += (tu*4099 + 4096) >> 13; \ + /* 5593/8192 ~= (1/Sqrt[2] - Cos[63*Pi/128])/Sin[63*Pi/128] */ \ + tv -= (t0*5593 + 4096) >> 13; \ + /* 5791/4096 ~= Sqrt[2]*Sin[63*Pi/128] */ \ + t0 += (tv*5791 + 2048) >> 12; \ + /* 2847/4096 ~= (1/Sqrt[2] - Cos[63*Pi/128]/2)/Sin[63*Pi/128] */ \ + tv -= (t0*2847 + 2048) >> 12; \ + \ + t7 = -t7; \ + tf = -tf; \ + tn = -tn; \ + tr = -tr; \ + \ + t7 -= OD_DCT_RSHIFT(t6, 1); \ + t6 += t7; \ + tp -= OD_DCT_RSHIFT(to, 1); \ + to += tp; \ + tr -= OD_DCT_RSHIFT(tq, 1); \ + tq += tr; \ + t5 -= OD_DCT_RSHIFT(t4, 1); \ + t4 += t5; \ + tt -= OD_DCT_RSHIFT(t3, 1); \ + t3 += tt; \ + ts -= OD_DCT_RSHIFT(t2, 1); \ + t2 += ts; \ + tv += OD_DCT_RSHIFT(tu, 1); \ + tu -= tv; \ + t1 -= OD_DCT_RSHIFT(t0, 1); \ + t0 += t1; \ + th -= OD_DCT_RSHIFT(tg, 1); \ + tg += th; \ + tf -= OD_DCT_RSHIFT(te, 1); \ + te += tf; \ + ti += OD_DCT_RSHIFT(tc, 1); \ + tc -= ti; \ + tj += OD_DCT_RSHIFT(td, 1); \ + td -= tj; \ + tn -= OD_DCT_RSHIFT(tm, 1); \ + tm += tn; \ + t9 -= OD_DCT_RSHIFT(t8, 1); \ + t8 += t9; \ + tl -= OD_DCT_RSHIFT(tb, 1); \ + tb += tl; \ + tk -= OD_DCT_RSHIFT(ta, 1); \ + 
ta += tk; \ + \ + ti -= th; \ + th += OD_DCT_RSHIFT(ti, 1); \ + td -= te; \ + te += OD_DCT_RSHIFT(td, 1); \ + tm += tl; \ + tl -= OD_DCT_RSHIFT(tm, 1); \ + t9 += ta; \ + ta -= OD_DCT_RSHIFT(t9, 1); \ + tp += tq; \ + tq -= OD_DCT_RSHIFT(tp, 1); \ + t6 += t5; \ + t5 -= OD_DCT_RSHIFT(t6, 1); \ + t2 -= t1; \ + t1 += OD_DCT_RSHIFT(t2, 1); \ + tt -= tu; \ + tu += OD_DCT_RSHIFT(tt, 1); \ + tr += t7; \ + trh = OD_DCT_RSHIFT(tr, 1); \ + t7 -= trh; \ + t4 -= to; \ + t4h = OD_DCT_RSHIFT(t4, 1); \ + to += t4h; \ + t0 += t3; \ + t0h = OD_DCT_RSHIFT(t0, 1); \ + t3 -= t0h; \ + tv += ts; \ + tvh = OD_DCT_RSHIFT(tv, 1); \ + ts -= tvh; \ + tf -= tc; \ + tfh = OD_DCT_RSHIFT(tf, 1); \ + tc += tfh; \ + tg += tj; \ + tgh = OD_DCT_RSHIFT(tg, 1); \ + tj -= tgh; \ + tb -= t8; \ + tbh = OD_DCT_RSHIFT(tb, 1); \ + t8 += tbh; \ + tk += tn; \ + tkh = OD_DCT_RSHIFT(tk, 1); \ + tn -= tkh; \ + \ + ta = -ta; \ + tq = -tq; \ + \ + /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ + te -= (th*4861 + 16384) >> 15; \ + /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \ + th += (te*1189 + 2048) >> 12; \ + /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ + te -= (th*4861 + 16384) >> 15; \ + /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \ + tm -= (t9*513 + 1024) >> 11; \ + /* 7723/16384 ~= Sin[5*Pi/32] ~= 0.47139673682599764 */ \ + t9 += (tm*7723 + 8192) >> 14; \ + /* 513/2048 ~= Tan[5*Pi/64] ~= 0.25048696019130545 */ \ + tm -= (t9*513 + 1024) >> 11; \ + /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ + t6 -= (tp*2931 + 4096) >> 13; \ + /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \ + tp += (t6*5197 + 4096) >> 13; \ + /* 2931/8192 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ + t6 -= (tp*2931 + 4096) >> 13; \ + /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ + tu -= (t1*805 + 8192) >> 14; \ + /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \ + t1 += (tu*803 + 4096) >> 13; \ + /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ + tu -= (t1*805 + 
8192) >> 14; \ + /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ + ti -= (td*4861 + 16384) >> 15; \ + /* 1189/4096 ~= Sin[3*Pi/32] ~= 0.29028467725446233 */ \ + td += (ti*1189 + 2048) >> 12; \ + /* 4861/32768 ~= Tan[3*Pi/64] ~= 0.14833598753834742 */ \ + ti -= (td*4861 + 16384) >> 15; \ + /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \ + ta -= (tl*2455 + 2048) >> 12; \ + /* 14449/16384 ~= Sin[11*Pi/32] ~= 0.881921264348355 */ \ + tl += (ta*14449 + 8192) >> 14; \ + /* 2455/4096 ~= Tan[11*Pi/64] ~= 0.5993769336819237 */ \ + ta -= (tl*2455 + 2048) >> 12; \ + /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ + t5 -= (tq*11725 + 16384) >> 15; \ + /* 5197/8192 ~= Sin[7*Pi/32] ~= 0.6343932841636455 */ \ + tq += (t5*5197 + 4096) >> 13; \ + /* 11725/32768 ~= Tan[7*Pi/64] ~= 0.3578057213145241 */ \ + t5 -= (tq*11725 + 16384) >> 15; \ + /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ + t2 -= (tt*805 + 8192) >> 14; \ + /* 803/8192 ~= Sin[Pi/32] ~= 0.0980171403295606 */ \ + tt += (t2*803 + 4096) >> 13; \ + /* 805/16384 ~= Tan[Pi/64] ~= 0.04912684976946793 */ \ + t2 -= (tt*805 + 8192) >> 14; \ + \ + tl = -tl; \ + ti = -ti; \ + \ + th += OD_DCT_RSHIFT(t9, 1); \ + t9 -= th; \ + te -= OD_DCT_RSHIFT(tm, 1); \ + tm += te; \ + t1 += OD_DCT_RSHIFT(tp, 1); \ + tp -= t1; \ + tu -= OD_DCT_RSHIFT(t6, 1); \ + t6 += tu; \ + ta -= OD_DCT_RSHIFT(td, 1); \ + td += ta; \ + tl += OD_DCT_RSHIFT(ti, 1); \ + ti -= tl; \ + t5 += OD_DCT_RSHIFT(tt, 1); \ + tt -= t5; \ + tq += OD_DCT_RSHIFT(t2, 1); \ + t2 -= tq; \ + \ + t8 -= tgh; \ + tg += t8; \ + tn += tfh; \ + tf -= tn; \ + t7 -= tvh; \ + tv += t7; \ + to -= t0h; \ + t0 += to; \ + tc += tbh; \ + tb -= tc; \ + tj += tkh; \ + tk -= tj; \ + ts += t4h; \ + t4 -= ts; \ + t3 += trh; \ + tr -= t3; \ + \ + tk = -tk; \ + \ + /* 2485/8192 ~= Tan[3*Pi/32] ~= 0.303346683607342 */ \ + tc -= (tj*2485 + 4096) >> 13; \ + /* 18205/32768 ~= Sin[3*Pi/16] ~= 0.555570233019602 */ \ + tj += (tc*18205 + 16384) >> 15; \ + /* 2485/8192 ~= 
Tan[3*Pi/32] ~= 0.303346683607342 */ \ + tc -= (tj*2485 + 4096) >> 13; \ + /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ + ts -= (t3*3227 + 16384) >> 15; \ + /* 6393/32768 ~= Sin[Pi/16] ~= 0.19509032201612825 */ \ + t3 += (ts*6393 + 16384) >> 15; \ + /* 3227/32768 ~= Tan[Pi/32] ~= 0.09849140335716425 */ \ + ts -= (t3*3227 + 16384) >> 15; \ + /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \ + tk -= (tb*17515 + 16384) >> 15; \ + /* 13623/16384 ~= Sin[5*Pi/16] ~= 0.8314696123025452 */ \ + tb += (tk*13623 + 8192) >> 14; \ + /* 17515/32768 ~= Tan[5*Pi/32] ~= 0.5345111359507916 */ \ + tk -= (tb*17515 + 16384) >> 15; \ + /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \ + t4 -= (tr*6723 + 4096) >> 13; \ + /* 16069/16384 ~= Sin[7*Pi/16] ~= 0.9807852804032304 */ \ + tr += (t4*16069 + 8192) >> 14; \ + /* 6723/8192 ~= Tan[7*Pi/32] ~= 0.8206787908286602 */ \ + t4 -= (tr*6723 + 4096) >> 13; \ + \ + t4 = -t4; \ + \ + tp += tm; \ + tm -= OD_DCT_RSHIFT(tp, 1); \ + t9 -= t6; \ + t6 += OD_DCT_RSHIFT(t9, 1); \ + th -= t1; \ + t1 += OD_DCT_RSHIFT(th, 1); \ + tu -= te; \ + te += OD_DCT_RSHIFT(tu, 1); /* pass */ \ + t5 -= tl; \ + tl += OD_DCT_RSHIFT(t5, 1); \ + ta += tq; \ + tq -= OD_DCT_RSHIFT(ta, 1); \ + td += tt; \ + tt -= OD_DCT_RSHIFT(td, 1); \ + t2 -= ti; \ + ti += OD_DCT_RSHIFT(t2, 1); /* pass */ \ + t7 += t8; \ + t8 -= OD_DCT_RSHIFT(t7, 1); \ + tn -= to; \ + to += OD_DCT_RSHIFT(tn, 1); \ + tf -= tv; \ + tv += OD_DCT_RSHIFT(tf, 1); \ + t0 += tg; \ + tg -= OD_DCT_RSHIFT(t0, 1); /* pass */ \ + tj -= t3; \ + t3 += OD_DCT_RSHIFT(tj, 1); /* pass */ \ + ts -= tc; \ + tc += OD_DCT_RSHIFT(ts, 1); \ + t4 -= tb; \ + tb += OD_DCT_RSHIFT(t4, 1); /* pass */ \ + tk -= tr; \ + tr += OD_DCT_RSHIFT(tk, 1); \ + \ + t1 = -t1; \ + t3 = -t3; \ + t7 = -t7; \ + t8 = -t8; \ + tg = -tg; \ + tm = -tm; \ + to = -to; \ + \ + /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ + tm -= (t9*14341 + 8192) >> 14; \ + /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 
*/ \ + t9 += (tm*15137 + 8192) >> 14; \ + /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ + tm -= (t9*4161 + 8192) >> 14; \ + /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ + tp -= (t6*4161 + 8192) >> 14; \ + /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ + t6 += (tp*15137 + 8192) >> 14; \ + /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ + tp -= (t6*28681 + 16384) >> 15; \ + /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ + th += (te*19195 + 16384) >> 15; \ + /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ + te += (th*11585 + 8192) >> 14; \ + /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ + th -= (te*29957 + 16384) >> 15; \ + /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ + tq -= (t5*14341 + 8192) >> 14; \ + /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ + t5 += (tq*15137 + 8192) >> 14; \ + /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ + tq -= (t5*4161 + 8192) >> 14; \ + /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \ + ta -= (tl*3259 + 4096) >> 13; \ + /* 3135/16384 ~= Sin[Pi/8]/2 ~= 0.1913417161825449 */ \ + tl += (ta*3135 + 8192) >> 14; \ + /* 3259/8192 ~= 2*Tan[Pi/16] ~= 0.397824734759316 */ \ + ta -= (tl*3259 + 4096) >> 13; \ + /* 7489/8192 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ + ti -= (td*7489 + 4096) >> 13; \ + /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ + td += (ti*11585 + 8192) >> 14; \ + /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ + ti += (td*19195 + 16384) >> 15; \ + /* 14341/16384 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ + to -= (t7*14341 + 8192) >> 14; \ + /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ + t7 += (to*15137 + 8192) >> 14; \ + /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 0.253965075546204 */ \ + to -= (t7*4161 + 8192) >> 14; \ + /* 4161/16384 ~= Tan[3*Pi/16] - Tan[Pi/8] ~= 
0.253965075546204 */ \ + tn -= (t8*4161 + 8192) >> 14; \ + /* 15137/16384 ~= Sin[3*Pi/8] ~= 0.923879532511287 */ \ + t8 += (tn*15137 + 8192) >> 14; \ + /* 28681/32768 ~= Tan[3*Pi/16] + Tan[Pi/8]/2 ~= 0.875285419105846 */ \ + tn -= (t8*28681 + 16384) >> 15; \ + /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ + tf += (tg*19195 + 16384) >> 15; \ + /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ + tg += (tf*11585 + 8192) >> 14; \ + /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ + tf -= (tg*29957 + 16384) >> 15; \ + /* -19195/32768 ~= Tan[Pi/8] - Tan[Pi/4] ~= -0.585786437626905 */ \ + tj += (tc*19195 + 16384) >> 15; \ + /* 11585/16384 ~= Sin[Pi/4] ~= 0.707106781186548 */ \ + tc += (tj*11585 + 8192) >> 14; \ + /* 29957/32768 ~= Tan[Pi/8] + Tan[Pi/4]/2 ~= 0.914213562373095 */ \ + tj -= (tc*29957 + 16384) >> 15; \ + /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ + tk += (tb*13573 + 8192) >> 14; \ + /* 11585/32768 ~= Sin[Pi/4]/2 ~= 0.353553390593274 */ \ + tb -= (tk*11585 + 16384) >> 15; \ + /* 13573/16384 ~= 2*Tan[Pi/8] ~= 0.828427124746190 */ \ + tk += (tb*13573 + 8192) >> 14; \ + \ + tf = -tf; \ + \ + } \ + while (0) + +#define OD_FDCT_64(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \ + us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \ + ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \ + ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \ + /* Embedded 64-point orthonormal Type-II fDCT. 
*/ \ + do { \ + int uwh; \ + int uxh; \ + int uyh; \ + int uzh; \ + int uAh; \ + int uBh; \ + int uCh; \ + int uDh; \ + int uEh; \ + int uFh; \ + int uGh; \ + int uHh; \ + int uIh; \ + int uJh; \ + int uKh; \ + int uLh; \ + int uMh; \ + int uNh; \ + int uOh; \ + int uPh; \ + int uQh; \ + int uRh; \ + int uSh; \ + int uTh; \ + int uUh; \ + int uVh; \ + int uWh; \ + int uXh; \ + int uYh; \ + int uZh; \ + int u_h; \ + int uh_; \ + u = u0 - u; \ + uh_ = OD_DCT_RSHIFT(u, 1); \ + u0 -= uh_; \ + u_ += u1; \ + u_h = OD_DCT_RSHIFT(u_, 1); \ + u1 = u_h - u1; \ + uZ = u2 - uZ; \ + uZh = OD_DCT_RSHIFT(uZ, 1); \ + u2 -= uZh; \ + uY += u3; \ + uYh = OD_DCT_RSHIFT(uY, 1); \ + u3 = uYh - u3; \ + uX = u4 - uX; \ + uXh = OD_DCT_RSHIFT(uX, 1); \ + u4 -= uXh; \ + uW += u5; \ + uWh = OD_DCT_RSHIFT(uW, 1); \ + u5 = uWh - u5; \ + uV = u6 - uV; \ + uVh = OD_DCT_RSHIFT(uV, 1); \ + u6 -= uVh; \ + uU += u7; \ + uUh = OD_DCT_RSHIFT(uU, 1); \ + u7 = uUh - u7; \ + uT = u8 - uT; \ + uTh = OD_DCT_RSHIFT(uT, 1); \ + u8 -= uTh; \ + uS += u9; \ + uSh = OD_DCT_RSHIFT(uS, 1); \ + u9 = uSh - u9; \ + uR = ua - uR; \ + uRh = OD_DCT_RSHIFT(uR, 1); \ + ua -= uRh; \ + uQ += ub; \ + uQh = OD_DCT_RSHIFT(uQ, 1); \ + ub = uQh - ub; \ + uP = uc - uP; \ + uPh = OD_DCT_RSHIFT(uP, 1); \ + uc -= uPh; \ + uO += ud; \ + uOh = OD_DCT_RSHIFT(uO, 1); \ + ud = uOh - ud; \ + uN = ue - uN; \ + uNh = OD_DCT_RSHIFT(uN, 1); \ + ue -= uNh; \ + uM += uf; \ + uMh = OD_DCT_RSHIFT(uM, 1); \ + uf = uMh - uf; \ + uL = ug - uL; \ + uLh = OD_DCT_RSHIFT(uL, 1); \ + ug -= uLh; \ + uK += uh; \ + uKh = OD_DCT_RSHIFT(uK, 1); \ + uh = uKh - uh; \ + uJ = ui - uJ; \ + uJh = OD_DCT_RSHIFT(uJ, 1); \ + ui -= uJh; \ + uI += uj; \ + uIh = OD_DCT_RSHIFT(uI, 1); \ + uj = uIh - uj; \ + uH = uk - uH; \ + uHh = OD_DCT_RSHIFT(uH, 1); \ + uk -= uHh; \ + uG += ul; \ + uGh = OD_DCT_RSHIFT(uG, 1); \ + ul = uGh - ul; \ + uF = um - uF; \ + uFh = OD_DCT_RSHIFT(uF, 1); \ + um -= uFh; \ + uE += un; \ + uEh = OD_DCT_RSHIFT(uE, 1); \ + un = uEh - un; \ + uD = uo - 
uD; \ + uDh = OD_DCT_RSHIFT(uD, 1); \ + uo -= uDh; \ + uC += up; \ + uCh = OD_DCT_RSHIFT(uC, 1); \ + up = uCh - up; \ + uB = uq - uB; \ + uBh = OD_DCT_RSHIFT(uB, 1); \ + uq -= uBh; \ + uA += ur; \ + uAh = OD_DCT_RSHIFT(uA, 1); \ + ur = uAh - ur; \ + uz = us - uz; \ + uzh = OD_DCT_RSHIFT(uz, 1); \ + us -= uzh; \ + uy += ut; \ + uyh = OD_DCT_RSHIFT(uy, 1); \ + ut = uyh - ut; \ + ux = uu - ux; \ + uxh = OD_DCT_RSHIFT(ux, 1); \ + uu -= uxh; \ + uw += uv; \ + uwh = OD_DCT_RSHIFT(uw, 1); \ + uv = uwh - uv; \ + OD_FDCT_32_ASYM(u0, uw, uwh, ug, uM, uMh, u8, uE, uEh, uo, uU, uUh, \ + u4, uA, uAh, uk, uQ, uQh, uc, uI, uIh, us, uY, uYh, u2, uy, uyh, \ + ui, uO, uOh, ua, uG, uGh, uq, uW, uWh, u6, uC, uCh, um, uS, uSh, \ + ue, uK, uKh, uu, u_, u_h); \ + OD_FDST_32_ASYM(u, uv, uL, uf, uT, un, uD, u7, uX, ur, uH, ub, uP, uj, \ + uz, u3, uZ, ut, uJ, ud, uR, ul, uB, u5, uV, up, uF, u9, uN, uh, ux, u1); \ + } \ + while (0) + +#define OD_IDCT_64(u0, uw, ug, uM, u8, uE, uo, uU, u4, uA, uk, uQ, uc, uI, \ + us, uY, u2, uy, ui, uO, ua, uG, uq, uW, u6, uC, um, uS, ue, uK, uu, u_, u1, \ + ux, uh, uN, u9, uF, up, uV, u5, uB, ul, uR, ud, uJ, ut, uZ, u3, uz, uj, uP, \ + ub, uH, ur, uX, u7, uD, un, uT, uf, uL, uv, u) \ + /* Embedded 64-point orthonormal Type-II iDCT.
*/ \ + do { \ + int u1h; \ + int u3h; \ + int u5h; \ + int u7h; \ + int u9h; \ + int ubh; \ + int udh; \ + int ufh; \ + int uhh; \ + int ujh; \ + int ulh; \ + int unh; \ + int uph; \ + int urh; \ + int uth; \ + int uvh; \ + int uxh; \ + int uzh; \ + int uBh; \ + int uDh; \ + int uFh; \ + int uHh; \ + int uJh; \ + int uLh; \ + int uNh; \ + int uPh; \ + int uRh; \ + int uTh; \ + int uVh; \ + int uXh; \ + int uZh; \ + int uh_; \ + OD_IDST_32_ASYM(u, uL, uT, uD, uX, uH, uP, uz, uZ, uJ, uR, uB, uV, uF, \ + uN, ux, u_, uK, uS, uC, uW, uG, uO, uy, uY, uI, uQ, uA, uU, uE, uM, uw); \ + OD_IDCT_32_ASYM(u0, ug, u8, uo, u4, uk, uc, us, u2, ui, ua, uq, u6, um, \ + ue, uu, u1, u1h, uh, uhh, u9, u9h, up, uph, u5, u5h, ul, ulh, ud, udh, \ + ut, uth, u3, u3h, uj, ujh, ub, ubh, ur, urh, u7, u7h, un, unh, uf, ufh, \ + uv, uvh); \ + uh_ = OD_DCT_RSHIFT(u, 1); \ + u0 += uh_; \ + u = u0 - u; \ + u_ = u1h - u_; \ + u1 -= u_; \ + uZh = OD_DCT_RSHIFT(uZ, 1); \ + u2 += uZh; \ + uZ = u2 - uZ; \ + uY = u3h - uY; \ + u3 -= uY; \ + uXh = OD_DCT_RSHIFT(uX, 1); \ + u4 += uXh; \ + uX = u4 - uX; \ + uW = u5h - uW; \ + u5 -= uW; \ + uVh = OD_DCT_RSHIFT(uV, 1); \ + u6 += uVh; \ + uV = u6 - uV; \ + uU = u7h - uU; \ + u7 -= uU; \ + uTh = OD_DCT_RSHIFT(uT, 1); \ + u8 += uTh; \ + uT = u8 - uT; \ + uS = u9h - uS; \ + u9 -= uS; \ + uRh = OD_DCT_RSHIFT(uR, 1); \ + ua += uRh; \ + uR = ua - uR; \ + uQ = ubh - uQ; \ + ub -= uQ; \ + uPh = OD_DCT_RSHIFT(uP, 1); \ + uc += uPh; \ + uP = uc - uP; \ + uO = udh - uO; \ + ud -= uO; \ + uNh = OD_DCT_RSHIFT(uN, 1); \ + ue += uNh; \ + uN = ue - uN; \ + uM = ufh - uM; \ + uf -= uM; \ + uLh = OD_DCT_RSHIFT(uL, 1); \ + ug += uLh; \ + uL = ug - uL; \ + uK = uhh - uK; \ + uh -= uK; \ + uJh = OD_DCT_RSHIFT(uJ, 1); \ + ui += uJh; \ + uJ = ui - uJ; \ + uI = ujh - uI; \ + uj -= uI; \ + uHh = OD_DCT_RSHIFT(uH, 1); \ + uk += uHh; \ + uH = uk - uH; \ + uG = ulh - uG; \ + ul -= uG; \ + uFh = OD_DCT_RSHIFT(uF, 1); \ + um += uFh; \ + uF = um - uF; \ + uE = unh - uE; \ + un -= uE; \ + 
uDh = OD_DCT_RSHIFT(uD, 1); \ + uo += uDh; \ + uD = uo - uD; \ + uC = uph - uC; \ + up -= uC; \ + uBh = OD_DCT_RSHIFT(uB, 1); \ + uq += uBh; \ + uB = uq - uB; \ + uA = urh - uA; \ + ur -= uA; \ + uzh = OD_DCT_RSHIFT(uz, 1); \ + us += uzh; \ + uz = us - uz; \ + uy = uth - uy; \ + ut -= uy; \ + uxh = OD_DCT_RSHIFT(ux, 1); \ + uu += uxh; \ + ux = uu - ux; \ + uw = uvh - uw; \ + uv -= uw; \ + } while (0) +#endif + void od_bin_fdct4(od_coeff y[4], const od_coeff *x, int xstride) { int q0; int q1; @@ -2342,3 +3672,405 @@ void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]) { x[30*xstride] = (od_coeff)tu; x[31*xstride] = (od_coeff)tv; } + +#if CONFIG_TX64X64 +void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride) { + int t0; + int t1; + int t2; + int t3; + int t4; + int t5; + int t6; + int t7; + int t8; + int t9; + int ta; + int tb; + int tc; + int td; + int te; + int tf; + int tg; + int th; + int ti; + int tj; + int tk; + int tl; + int tm; + int tn; + int to; + int tp; + int tq; + int tr; + int ts; + int tt; + int tu; + int tv; + int tw; + int tx; + int ty; + int tz; + int tA; + int tB; + int tC; + int tD; + int tE; + int tF; + int tG; + int tH; + int tI; + int tJ; + int tK; + int tL; + int tM; + int tN; + int tO; + int tP; + int tQ; + int tR; + int tS; + int tT; + int tU; + int tV; + int tW; + int tX; + int tY; + int tZ; + int t_; + int t; + t0 = x[0*xstride]; + tw = x[1*xstride]; + tg = x[2*xstride]; + tM = x[3*xstride]; + t8 = x[4*xstride]; + tE = x[5*xstride]; + to = x[6*xstride]; + tU = x[7*xstride]; + t4 = x[8*xstride]; + tA = x[9*xstride]; + tk = x[10*xstride]; + tQ = x[11*xstride]; + tc = x[12*xstride]; + tI = x[13*xstride]; + ts = x[14*xstride]; + tY = x[15*xstride]; + t2 = x[16*xstride]; + ty = x[17*xstride]; + ti = x[18*xstride]; + tO = x[19*xstride]; + ta = x[20*xstride]; + tG = x[21*xstride]; + tq = x[22*xstride]; + tW = x[23*xstride]; + t6 = x[24*xstride]; + tC = x[25*xstride]; + tm = x[26*xstride]; + tS = x[27*xstride]; + te = 
x[28*xstride]; + tK = x[29*xstride]; + tu = x[30*xstride]; + t_ = x[31*xstride]; + t1 = x[32*xstride]; + tx = x[33*xstride]; + th = x[34*xstride]; + tN = x[35*xstride]; + t9 = x[36*xstride]; + tF = x[37*xstride]; + tp = x[38*xstride]; + tV = x[39*xstride]; + t5 = x[40*xstride]; + tB = x[41*xstride]; + tl = x[42*xstride]; + tR = x[43*xstride]; + td = x[44*xstride]; + tJ = x[45*xstride]; + tt = x[46*xstride]; + tZ = x[47*xstride]; + t3 = x[48*xstride]; + tz = x[49*xstride]; + tj = x[50*xstride]; + tP = x[51*xstride]; + tb = x[52*xstride]; + tH = x[53*xstride]; + tr = x[54*xstride]; + tX = x[55*xstride]; + t7 = x[56*xstride]; + tD = x[57*xstride]; + tn = x[58*xstride]; + tT = x[59*xstride]; + tf = x[60*xstride]; + tL = x[61*xstride]; + tv = x[62*xstride]; + t = x[63*xstride]; + OD_FDCT_64(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY, + t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx, + th, tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP, + tb, tH, tr, tX, t7, tD, tn, tT, tf, tL, tv, t); + y[0] = (od_coeff)t0; + y[1] = (od_coeff)t1; + y[2] = (od_coeff)t2; + y[3] = (od_coeff)t3; + y[4] = (od_coeff)t4; + y[5] = (od_coeff)t5; + y[6] = (od_coeff)t6; + y[7] = (od_coeff)t7; + y[8] = (od_coeff)t8; + y[9] = (od_coeff)t9; + y[10] = (od_coeff)ta; + y[11] = (od_coeff)tb; + y[12] = (od_coeff)tc; + y[13] = (od_coeff)td; + y[14] = (od_coeff)te; + y[15] = (od_coeff)tf; + y[16] = (od_coeff)tg; + y[17] = (od_coeff)th; + y[18] = (od_coeff)ti; + y[19] = (od_coeff)tj; + y[20] = (od_coeff)tk; + y[21] = (od_coeff)tl; + y[22] = (od_coeff)tm; + y[23] = (od_coeff)tn; + y[24] = (od_coeff)to; + y[25] = (od_coeff)tp; + y[26] = (od_coeff)tq; + y[27] = (od_coeff)tr; + y[28] = (od_coeff)ts; + y[29] = (od_coeff)tt; + y[30] = (od_coeff)tu; + y[31] = (od_coeff)tv; + y[32] = (od_coeff)tw; + y[33] = (od_coeff)tx; + y[34] = (od_coeff)ty; + y[35] = (od_coeff)tz; + y[36] = (od_coeff)tA; + y[37] = (od_coeff)tB; + y[38] = (od_coeff)tC; + y[39] = 
(od_coeff)tD; + y[40] = (od_coeff)tE; + y[41] = (od_coeff)tF; + y[41] = (od_coeff)tF; + y[42] = (od_coeff)tG; + y[43] = (od_coeff)tH; + y[44] = (od_coeff)tI; + y[45] = (od_coeff)tJ; + y[46] = (od_coeff)tK; + y[47] = (od_coeff)tL; + y[48] = (od_coeff)tM; + y[49] = (od_coeff)tN; + y[50] = (od_coeff)tO; + y[51] = (od_coeff)tP; + y[52] = (od_coeff)tQ; + y[53] = (od_coeff)tR; + y[54] = (od_coeff)tS; + y[55] = (od_coeff)tT; + y[56] = (od_coeff)tU; + y[57] = (od_coeff)tV; + y[58] = (od_coeff)tW; + y[59] = (od_coeff)tX; + y[60] = (od_coeff)tY; + y[61] = (od_coeff)tZ; + y[62] = (od_coeff)t_; + y[63] = (od_coeff)t; +} + +void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]) { + int t0; + int t1; + int t2; + int t3; + int t4; + int t5; + int t6; + int t7; + int t8; + int t9; + int ta; + int tb; + int tc; + int td; + int te; + int tf; + int tg; + int th; + int ti; + int tj; + int tk; + int tl; + int tm; + int tn; + int to; + int tp; + int tq; + int tr; + int ts; + int tt; + int tu; + int tv; + int tw; + int tx; + int ty; + int tz; + int tA; + int tB; + int tC; + int tD; + int tE; + int tF; + int tG; + int tH; + int tI; + int tJ; + int tK; + int tL; + int tM; + int tN; + int tO; + int tP; + int tQ; + int tR; + int tS; + int tT; + int tU; + int tV; + int tW; + int tX; + int tY; + int tZ; + int t_; + int t; + t0 = y[0]; + tw = y[1]; + tg = y[2]; + tM = y[3]; + t8 = y[4]; + tE = y[5]; + to = y[6]; + tU = y[7]; + t4 = y[8]; + tA = y[9]; + tk = y[10]; + tQ = y[11]; + tc = y[12]; + tI = y[13]; + ts = y[14]; + tY = y[15]; + t2 = y[16]; + ty = y[17]; + ti = y[18]; + tO = y[19]; + ta = y[20]; + tG = y[21]; + tq = y[22]; + tW = y[23]; + t6 = y[24]; + tC = y[25]; + tm = y[26]; + tS = y[27]; + te = y[28]; + tK = y[29]; + tu = y[30]; + t_ = y[31]; + t1 = y[32]; + tx = y[33]; + th = y[34]; + tN = y[35]; + t9 = y[36]; + tF = y[37]; + tp = y[38]; + tV = y[39]; + t5 = y[40]; + tB = y[41]; + tl = y[42]; + tR = y[43]; + td = y[44]; + tJ = y[45]; + tt = y[46]; + tZ = y[47]; + t3 = 
y[48]; + tz = y[49]; + tj = y[50]; + tP = y[51]; + tb = y[52]; + tH = y[53]; + tr = y[54]; + tX = y[55]; + t7 = y[56]; + tD = y[57]; + tn = y[58]; + tT = y[59]; + tf = y[60]; + tL = y[61]; + tv = y[62]; + t = y[63]; + OD_IDCT_64(t0, tw, tg, tM, t8, tE, to, tU, t4, tA, tk, tQ, tc, tI, ts, tY, + t2, ty, ti, tO, ta, tG, tq, tW, t6, tC, tm, tS, te, tK, tu, t_, t1, tx, + th, tN, t9, tF, tp, tV, t5, tB, tl, tR, td, tJ, tt, tZ, t3, tz, tj, tP, + tb, tH, tr, tX, t7, tD, tn, tT, tf, tL, tv, t); + x[0*xstride] = (od_coeff)t0; + x[1*xstride] = (od_coeff)t1; + x[2*xstride] = (od_coeff)t2; + x[3*xstride] = (od_coeff)t3; + x[4*xstride] = (od_coeff)t4; + x[5*xstride] = (od_coeff)t5; + x[6*xstride] = (od_coeff)t6; + x[7*xstride] = (od_coeff)t7; + x[8*xstride] = (od_coeff)t8; + x[9*xstride] = (od_coeff)t9; + x[10*xstride] = (od_coeff)ta; + x[11*xstride] = (od_coeff)tb; + x[12*xstride] = (od_coeff)tc; + x[13*xstride] = (od_coeff)td; + x[14*xstride] = (od_coeff)te; + x[15*xstride] = (od_coeff)tf; + x[16*xstride] = (od_coeff)tg; + x[17*xstride] = (od_coeff)th; + x[18*xstride] = (od_coeff)ti; + x[19*xstride] = (od_coeff)tj; + x[20*xstride] = (od_coeff)tk; + x[21*xstride] = (od_coeff)tl; + x[22*xstride] = (od_coeff)tm; + x[23*xstride] = (od_coeff)tn; + x[24*xstride] = (od_coeff)to; + x[25*xstride] = (od_coeff)tp; + x[26*xstride] = (od_coeff)tq; + x[27*xstride] = (od_coeff)tr; + x[28*xstride] = (od_coeff)ts; + x[29*xstride] = (od_coeff)tt; + x[30*xstride] = (od_coeff)tu; + x[31*xstride] = (od_coeff)tv; + x[32*xstride] = (od_coeff)tw; + x[33*xstride] = (od_coeff)tx; + x[34*xstride] = (od_coeff)ty; + x[35*xstride] = (od_coeff)tz; + x[36*xstride] = (od_coeff)tA; + x[37*xstride] = (od_coeff)tB; + x[38*xstride] = (od_coeff)tC; + x[39*xstride] = (od_coeff)tD; + x[40*xstride] = (od_coeff)tE; + x[41*xstride] = (od_coeff)tF; + x[41*xstride] = (od_coeff)tF; + x[42*xstride] = (od_coeff)tG; + x[43*xstride] = (od_coeff)tH; + x[44*xstride] = (od_coeff)tI; + x[45*xstride] = (od_coeff)tJ; + 
x[46*xstride] = (od_coeff)tK; + x[47*xstride] = (od_coeff)tL; + x[48*xstride] = (od_coeff)tM; + x[49*xstride] = (od_coeff)tN; + x[50*xstride] = (od_coeff)tO; + x[51*xstride] = (od_coeff)tP; + x[52*xstride] = (od_coeff)tQ; + x[53*xstride] = (od_coeff)tR; + x[54*xstride] = (od_coeff)tS; + x[55*xstride] = (od_coeff)tT; + x[56*xstride] = (od_coeff)tU; + x[57*xstride] = (od_coeff)tV; + x[58*xstride] = (od_coeff)tW; + x[59*xstride] = (od_coeff)tX; + x[60*xstride] = (od_coeff)tY; + x[61*xstride] = (od_coeff)tZ; + x[62*xstride] = (od_coeff)t_; + x[63*xstride] = (od_coeff)t; +} +#endif diff --git a/av1/common/daala_tx.h b/av1/common/daala_tx.h index b0f24a1bc..cef35c979 100644 --- a/av1/common/daala_tx.h +++ b/av1/common/daala_tx.h @@ -15,5 +15,8 @@ void od_bin_fdst16(od_coeff y[16], const od_coeff *x, int xstride); void od_bin_idst16(od_coeff *x, int xstride, const od_coeff y[16]); void od_bin_fdct32(od_coeff y[32], const od_coeff *x, int xstride); void od_bin_idct32(od_coeff *x, int xstride, const od_coeff y[32]); - +#if CONFIG_TX64X64 +void od_bin_fdct64(od_coeff y[64], const od_coeff *x, int xstride); +void od_bin_idct64(od_coeff *x, int xstride, const od_coeff y[64]); +#endif #endif diff --git a/av1/common/idct.c b/av1/common/idct.c index f5f65936b..13596f2c0 100644 --- a/av1/common/idct.c +++ b/av1/common/idct.c @@ -81,8 +81,13 @@ static void iidtx32_c(const tran_low_t *input, tran_low_t *output) { #if CONFIG_TX64X64 static void iidtx64_c(const tran_low_t *input, tran_low_t *output) { int i; - for (i = 0; i < 64; ++i) + for (i = 0; i < 64; ++i) { +#if CONFIG_DAALA_DCT64 + output[i] = input[i]; +#else output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2); +#endif + } } #endif // CONFIG_TX64X64 #endif // CONFIG_EXT_TX @@ -118,6 +123,29 @@ static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) { #endif #if CONFIG_TX64X64 +#if CONFIG_DAALA_DCT64 +static void idct64_col_c(const tran_low_t *input, tran_low_t *output) { + aom_idct64_c(input, 
output); +} + +static void idct64_row_c(const tran_low_t *input, tran_low_t *output) { + aom_idct64_c(input, output); +} + +static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) { + int i; + tran_low_t inputhalf[32]; + // No scaling within; Daala transforms are all orthonormal + for (i = 0; i < 32; ++i) { + inputhalf[i] = input[i]; + } + for (i = 0; i < 32; ++i) { + output[i] = input[32 + i]; + } + aom_idct32_c(inputhalf, output + 32); +} + +#else static void idct64_col_c(const tran_low_t *input, tran_low_t *output) { int32_t in[64], out[64]; int i; @@ -148,6 +176,7 @@ static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) { aom_idct32_c(inputhalf, output + 32); // Note overall scaling factor is 4 * sqrt(2) times orthogonal } +#endif // CONFIG_DAALA_DCT64 #endif // CONFIG_TX64X64 // Inverse identity transform and add. @@ -1416,8 +1445,15 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride, // inverse transform row vectors for (i = 0; i < 64; ++i) { +#if CONFIG_DAALA_DCT64 + tran_low_t temp_in[64]; + for (j = 0; j < 64; j++) temp_in[j] = input[j] * 2; + IHT_64[tx_type].rows(temp_in, out[i]); +// Do not rescale intermediate for Daala +#else IHT_64[tx_type].rows(input, out[i]); for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1); +#endif input += 64; } @@ -1440,7 +1476,11 @@ void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride, for (j = 0; j < 64; ++j) { int d = i * stride + j; int s = j * outstride + i; +#if CONFIG_DAALA_DCT64 + dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 2)); +#else dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5)); +#endif } } } @@ -1575,13 +1615,13 @@ static void idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, } #endif -#if CONFIG_TX64X64 +#if CONFIG_TX64X64 && !CONFIG_DAALA_DCT64 static void idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { 
(void)txfm_param; av1_iht64x64_4096_add(input, dest, stride, txfm_param); } -#endif // CONFIG_TX64X64 +#endif // CONFIG_TX64X64 && !CONFIG_DAALA_DCT64 #if CONFIG_CHROMA_2X2 static void inv_txfm_add_2x2(const tran_low_t *input, uint8_t *dest, int stride, @@ -1875,7 +1915,11 @@ static void inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { const TX_TYPE tx_type = txfm_param->tx_type; switch (tx_type) { +#if !CONFIG_DAALA_DCT64 case DCT_DCT: idct64x64_add(input, dest, stride, txfm_param); break; +#else + case DCT_DCT: +#endif #if CONFIG_EXT_TX case ADST_DCT: case DCT_ADST: diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c index bc5d8947e..0bb4798c5 100644 --- a/av1/encoder/dct.c +++ b/av1/encoder/dct.c @@ -22,7 +22,7 @@ #include "av1/common/av1_fwd_txfm1d_cfg.h" #include "av1/common/idct.h" #if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \ - CONFIG_DAALA_DCT32 + CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64 #include "av1/common/daala_tx.h" #endif @@ -782,6 +782,16 @@ static void fdct32(const tran_low_t *input, tran_low_t *output) { #endif #ifndef AV1_DCT_GTEST +#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64 +static void fdct64(const tran_low_t *input, tran_low_t *output) { + int i; + od_coeff x[64]; + od_coeff y[64]; + for (i = 0; i < 64; i++) x[i] = (od_coeff)input[i]; + od_bin_fdct64(y, x, 1); + for (i = 0; i < 64; i++) output[i] = (tran_low_t)y[i]; +} +#endif static void fadst4(const tran_low_t *input, tran_low_t *output) { tran_high_t x0, x1, x2, x3; @@ -2530,6 +2540,37 @@ void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride, } #if CONFIG_TX64X64 +#if CONFIG_DAALA_DCT64 +#if CONFIG_EXT_TX +static void fidtx64(const tran_low_t *input, tran_low_t *output) { + int i; + for (i = 0; i < 64; ++i) output[i] = input[i]; +} + +// For use in lieu of ADST +static void fhalfright64(const tran_low_t *input, tran_low_t *output) { + int i; + tran_low_t inputhalf[32]; + // No scaling within; Daala 
transforms are all orthonormal + for (i = 0; i < 32; ++i) { + output[32 + i] = input[i]; + } + for (i = 0; i < 32; ++i) { + inputhalf[i] = input[i + 32]; + } + fdct32(inputhalf, output); + // Note overall scaling factor is 2 times unitary +} +#endif // CONFIG_EXT_TX + +static void fdct64_col(const tran_low_t *input, tran_low_t *output) { + fdct64(input, output); +} + +static void fdct64_row(const tran_low_t *input, tran_low_t *output) { + fdct64(input, output); +} +#else #if CONFIG_EXT_TX static void fidtx64(const tran_low_t *input, tran_low_t *output) { int i; @@ -2568,6 +2609,7 @@ static void fdct64_row(const tran_low_t *input, tran_low_t *output) { av1_fdct64_new(in, out, fwd_cos_bit_row_dct_64, fwd_stage_range_row_dct_64); for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i]; } +#endif void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride, TxfmParam *txfm_param) { @@ -2609,10 +2651,18 @@ void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride, // Columns for (i = 0; i < 64; ++i) { +#if CONFIG_DAALA_DCT64 + for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i] * 16; + ht.cols(temp_in, temp_out); + for (j = 0; j < 64; ++j) + out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 3; + +#else for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i]; ht.cols(temp_in, temp_out); for (j = 0; j < 64; ++j) out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; +#endif } // Rows @@ -2620,8 +2670,12 @@ void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride, for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64]; ht.rows(temp_in, temp_out); for (j = 0; j < 64; ++j) +#if CONFIG_DAALA_DCT64 + output[j + i * 64] = temp_out[j]; +#else output[j + i * 64] = (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); +#endif } } #endif // CONFIG_TX64X64 diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake index 5f6873f3e..66a591805 100644 --- 
a/build/cmake/aom_config_defaults.cmake +++ b/build/cmake/aom_config_defaults.cmake @@ -127,6 +127,7 @@ set(CONFIG_CDEF 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_DAALA_DCT16 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_DAALA_DCT32 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_DAALA_DCT4 0 CACHE NUMBER "AV1 experiment flag.") +set(CONFIG_DAALA_DCT64 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_DAALA_DCT8 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_DAALA_DIST 0 CACHE NUMBER "AV1 experiment flag.") set(CONFIG_DCT_ONLY 0 CACHE NUMBER "AV1 experiment flag.") diff --git a/build/cmake/aom_configure.cmake b/build/cmake/aom_configure.cmake index 8c9993bba..df4252f61 100644 --- a/build/cmake/aom_configure.cmake +++ b/build/cmake/aom_configure.cmake @@ -247,8 +247,16 @@ if (CONFIG_DAALA_DCT4 AND NOT CONFIG_DCT_ONLY) change_config_and_warn(CONFIG_DCT_ONLY 1 CONFIG_DAALA_DCT4) endif() +if (CONFIG_DAALA_DCT64) + if (NOT CONFIG_TX64X64) + message(WARNING + "--- Enabled CONFIG_TX64X64, needed for CONFIG_DAALA_DCT64.") + set(CONFIG_TX64X64 1) + endif() +endif() + if (CONFIG_DAALA_DCT4 OR CONFIG_DAALA_DCT8 OR CONFIG_DAALA_DCT16 OR - CONFIG_DAALA_DCT32) + CONFIG_DAALA_DCT32 OR CONFIG_DAALA_DCT64) if (HAVE_MMX) change_config_and_warn(HAVE_MMX 0 CONFIG_DAALA_DCTx) endif() diff --git a/configure b/configure index 3dbd9d56a..009300775 100755 --- a/configure +++ b/configure @@ -295,6 +295,7 @@ EXPERIMENT_LIST=" daala_dct8 daala_dct16 daala_dct32 + daala_dct64 cb4x4 chroma_2x2 chroma_sub8x8 @@ -576,10 +577,14 @@ post_process_cmdline() { if enabled daala_dct4; then enable_feature dct_only fi + if enabled daala_dct64; then + enable_feature tx64x64 + fi if enabled daala_dct4 || enabled daala_dct8 || enabled daala_dct16 || - enabled daala_dct32; then + enabled daala_dct32 || + enabled daala_dct64; then disable_feature mmx disable_feature rect_tx disable_feature var_tx -- GitLab