kiss_fft.c 15.7 KB
Newer Older
1
2
3
4
/*Copyright (c) 2003-2004, Mark Borgerding
  Lots of modifications by Jean-Marc Valin
  Copyright (c) 2005-2007, Xiph.Org Foundation
  Copyright (c) 2008,      Xiph.Org Foundation, CSIRO
5

6
  All rights reserved.
7

8
9
  Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
10

11
12
13
14
15
    * Redistributions of source code must retain the above copyright notice,
       this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
       this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
16

17
18
19
20
21
22
23
24
25
26
27
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  POSSIBILITY OF SUCH DAMAGE.*/
28
29
30
31

/* This code is originally from Mark Borgerding's KISS-FFT but has been
   heavily modified to better suit Opus */

32
33
34
35
#ifndef SKIP_CONFIG_H
#  ifdef HAVE_CONFIG_H
#    include "config.h"
#  endif
36
37
38
39
40
#endif

#include "_kiss_fft_guts.h"
#include "arch.h"
#include "os_support.h"
41
#include "mathops.h"
42
#include "stack_alloc.h"
43
44

/* The guts header contains all the multiplication and addition macros that are defined for
45
46
   complex numbers.  It also delares the kf_ internal functions.
*/
47
48

static void kf_bfly2(
49
50
                     kiss_fft_cpx * Fout,
                     int m,
51
                     int N
52
                    )
53
{
54
   kiss_fft_cpx * Fout2;
55
   int i;
56
   (void)m;
57
58
#ifdef CUSTOM_MODES
   if (m==1)
59
   {
60
61
      celt_assert(m==1);
      for (i=0;i<N;i++)
62
      {
63
         kiss_fft_cpx t;
64
65
         Fout2 = Fout + 1;
         t = *Fout2;
66
         C_SUB( *Fout2 ,  *Fout , t );
67
         C_ADDTO( *Fout ,  t );
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
         Fout += 2;
      }
   } else
#endif
   {
      opus_val16 tw;
      tw = QCONST16(0.7071067812f, 15);
      /* We know that m==4 here because the radix-2 is just after a radix-4 */
      celt_assert(m==4);
      for (i=0;i<N;i++)
      {
         kiss_fft_cpx t;
         Fout2 = Fout + 4;
         t = Fout2[0];
         C_SUB( Fout2[0] ,  Fout[0] , t );
         C_ADDTO( Fout[0] ,  t );

         t.r = S_MUL(Fout2[1].r+Fout2[1].i, tw);
         t.i = S_MUL(Fout2[1].i-Fout2[1].r, tw);
         C_SUB( Fout2[1] ,  Fout[1] , t );
         C_ADDTO( Fout[1] ,  t );

         t.r = Fout2[2].i;
         t.i = -Fout2[2].r;
         C_SUB( Fout2[2] ,  Fout[2] , t );
         C_ADDTO( Fout[2] ,  t );

         t.r = S_MUL(Fout2[3].i-Fout2[3].r, tw);
         t.i = S_MUL(-Fout2[3].i-Fout2[3].r, tw);
         C_SUB( Fout2[3] ,  Fout[3] , t );
         C_ADDTO( Fout[3] ,  t );
         Fout += 8;
100
101
      }
   }
102
103
}

104
105
106
static void kf_bfly4(
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
107
                     const kiss_fft_state *st,
108
109
110
111
112
                     int m,
                     int N,
                     int mm
                    )
{
113
   int i;
114

115
   if (m==1)
116
   {
117
118
      /* Degenerate case where all the twiddles are 1. */
      for (i=0;i<N;i++)
119
      {
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
         kiss_fft_cpx scratch0, scratch1;

         C_SUB( scratch0 , *Fout, Fout[2] );
         C_ADDTO(*Fout, Fout[2]);
         C_ADD( scratch1 , Fout[1] , Fout[3] );
         C_SUB( Fout[2], *Fout, scratch1 );
         C_ADDTO( *Fout , scratch1 );
         C_SUB( scratch1 , Fout[1] , Fout[3] );

         Fout[1].r = scratch0.r + scratch1.i;
         Fout[1].i = scratch0.i - scratch1.r;
         Fout[3].r = scratch0.r - scratch1.i;
         Fout[3].i = scratch0.i + scratch1.r;
         Fout+=4;
      }
   } else {
      int j;
      kiss_fft_cpx scratch[6];
      const kiss_twiddle_cpx *tw1,*tw2,*tw3;
      const int m2=2*m;
      const int m3=3*m;
      kiss_fft_cpx * Fout_beg = Fout;
      for (i=0;i<N;i++)
      {
         Fout = Fout_beg + i*mm;
         tw3 = tw2 = tw1 = st->twiddles;
146
         /* m is guaranteed to be a multiple of 4. */
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
         for (j=0;j<m;j++)
         {
            C_MUL(scratch[0],Fout[m] , *tw1 );
            C_MUL(scratch[1],Fout[m2] , *tw2 );
            C_MUL(scratch[2],Fout[m3] , *tw3 );

            C_SUB( scratch[5] , *Fout, scratch[1] );
            C_ADDTO(*Fout, scratch[1]);
            C_ADD( scratch[3] , scratch[0] , scratch[2] );
            C_SUB( scratch[4] , scratch[0] , scratch[2] );
            C_SUB( Fout[m2], *Fout, scratch[3] );
            tw1 += fstride;
            tw2 += fstride*2;
            tw3 += fstride*3;
            C_ADDTO( *Fout , scratch[3] );

            Fout[m].r = scratch[5].r + scratch[4].i;
            Fout[m].i = scratch[5].i - scratch[4].r;
            Fout[m3].r = scratch[5].r - scratch[4].i;
            Fout[m3].i = scratch[5].i + scratch[4].r;
            ++Fout;
         }
169
170
171
172
      }
   }
}

173

174
#ifndef RADIX_TWO_ONLY
175

176
static void kf_bfly3(
177
178
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
179
                     const kiss_fft_state *st,
180
181
182
                     int m,
                     int N,
                     int mm
183
                    )
184
{
185
186
   int i;
   size_t k;
187
   const size_t m2 = 2*m;
Jean-Marc Valin's avatar
Jean-Marc Valin committed
188
   const kiss_twiddle_cpx *tw1,*tw2;
189
   kiss_fft_cpx scratch[5];
190
   kiss_twiddle_cpx epi3;
191

192
   kiss_fft_cpx * Fout_beg = Fout;
193
194
195
196
#ifdef FIXED_POINT
   epi3.r = -16384;
   epi3.i = -28378;
#else
197
   epi3 = st->twiddles[fstride*m];
198
#endif
199
200
201
202
   for (i=0;i<N;i++)
   {
      Fout = Fout_beg + i*mm;
      tw1=tw2=st->twiddles;
203
      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
204
205
      k=m;
      do {
206

207
208
         C_MUL(scratch[1],Fout[m] , *tw1);
         C_MUL(scratch[2],Fout[m2] , *tw2);
209

210
211
212
213
         C_ADD(scratch[3],scratch[1],scratch[2]);
         C_SUB(scratch[0],scratch[1],scratch[2]);
         tw1 += fstride;
         tw2 += fstride*2;
214

215
216
         Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
         Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
217

218
         C_MULBYSCALAR( scratch[0] , epi3.i );
219

220
         C_ADDTO(*Fout,scratch[3]);
221

222
223
         Fout[m2].r = Fout[m].r + scratch[0].i;
         Fout[m2].i = Fout[m].i - scratch[0].r;
224

225
226
         Fout[m].r -= scratch[0].i;
         Fout[m].i += scratch[0].r;
227

228
229
230
         ++Fout;
      } while(--k);
   }
231
}
232

233

234
235
236
static void kf_bfly5(
                     kiss_fft_cpx * Fout,
                     const size_t fstride,
237
                     const kiss_fft_state *st,
238
239
240
                     int m,
                     int N,
                     int mm
241
242
243
                    )
{
   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
244
   int i, u;
245
   kiss_fft_cpx scratch[13];
Jean-Marc Valin's avatar
Jean-Marc Valin committed
246
   const kiss_twiddle_cpx *tw;
247
   kiss_twiddle_cpx ya,yb;
248
249
   kiss_fft_cpx * Fout_beg = Fout;

250
251
252
253
254
255
256
257
258
#ifdef FIXED_POINT
   ya.r = 10126;
   ya.i = -31164;
   yb.r = -26510;
   yb.i = -19261;
#else
   ya = st->twiddles[fstride*m];
   yb = st->twiddles[fstride*2*m];
#endif
259
   tw=st->twiddles;
260

261
262
263
264
265
266
267
268
   for (i=0;i<N;i++)
   {
      Fout = Fout_beg + i*mm;
      Fout0=Fout;
      Fout1=Fout0+m;
      Fout2=Fout0+2*m;
      Fout3=Fout0+3*m;
      Fout4=Fout0+4*m;
269

270
      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
271
272
      for ( u=0; u<m; ++u ) {
         scratch[0] = *Fout0;
273

274
275
276
277
         C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
         C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
         C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
         C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
278

279
280
281
282
         C_ADD( scratch[7],scratch[1],scratch[4]);
         C_SUB( scratch[10],scratch[1],scratch[4]);
         C_ADD( scratch[8],scratch[2],scratch[3]);
         C_SUB( scratch[9],scratch[2],scratch[3]);
283

284
285
         Fout0->r += scratch[7].r + scratch[8].r;
         Fout0->i += scratch[7].i + scratch[8].i;
286

287
288
         scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
         scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
289

290
291
         scratch[6].r =  S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
         scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
292

293
294
         C_SUB(*Fout1,scratch[5],scratch[6]);
         C_ADD(*Fout4,scratch[5],scratch[6]);
295

296
297
298
299
         scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
         scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
         scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
         scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
300

301
302
         C_ADD(*Fout2,scratch[11],scratch[12]);
         C_SUB(*Fout3,scratch[11],scratch[12]);
303

304
305
         ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
      }
306
307
   }
}
308

309

310
#endif
311

312

313
#ifdef CUSTOM_MODES
314
315
316
317

static
void compute_bitrev_table(
         int Fout,
318
         opus_int16 *f,
319
320
         const size_t fstride,
         int in_stride,
321
         opus_int16 * factors,
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
         const kiss_fft_state *st
            )
{
   const int p=*factors++; /* the radix  */
   const int m=*factors++; /* stage's fft length/p */

    /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
   if (m==1)
   {
      int j;
      for (j=0;j<p;j++)
      {
         *f = Fout+j;
         f += fstride*in_stride;
      }
   } else {
      int j;
      for (j=0;j<p;j++)
      {
         compute_bitrev_table( Fout , f, fstride*p, in_stride, factors,st);
         f += fstride*in_stride;
         Fout += m;
      }
   }
}

348
/*  facbuf is populated by p1,m1,p2,m2, ...
349
    where
350
351
    p[i] * m[i] = m[i-1]
    m0 = n                  */
352
static
353
int kf_factor(int n,opus_int16 * facbuf)
354
355
{
    int p=4;
356
357
358
    int i;
    int stages=0;
    int nbak = n;
359
360
361
362
363
364
365
366
367

    /*factor out powers of 4, powers of 2, then any remaining primes */
    do {
        while (n % p) {
            switch (p) {
                case 4: p = 2; break;
                case 2: p = 3; break;
                default: p += 2; break;
            }
368
            if (p>32000 || (opus_int32)p*(opus_int32)p > n)
369
370
371
                p = n;          /* no more factors, skip to end */
        }
        n /= p;
372
373
374
#ifdef RADIX_TWO_ONLY
        if (p!=2 && p != 4)
#else
375
        if (p>5)
376
#endif
377
378
379
        {
           return 0;
        }
380
        facbuf[2*stages] = p;
381
382
383
384
385
        if (p==2 && stages > 1)
        {
           facbuf[2*stages] = 4;
           facbuf[2] = 2;
        }
386
        stages++;
387
    } while (n > 1);
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
    n = nbak;
    /* Reverse the order to get the radix 4 at the end, so we can use the
       fast degenerate case. It turns out that reversing the order also
       improves the noise behaviour. */
    for (i=0;i<stages/2;i++)
    {
       int tmp;
       tmp = facbuf[2*i];
       facbuf[2*i] = facbuf[2*(stages-i-1)];
       facbuf[2*(stages-i-1)] = tmp;
    }
    for (i=0;i<stages;i++)
    {
        n /= facbuf[2*i];
        facbuf[2*i+1] = n;
    }
404
    return 1;
405
}
406
407
408
409

static void compute_twiddles(kiss_twiddle_cpx *twiddles, int nfft)
{
   int i;
410
#ifdef FIXED_POINT
411
   for (i=0;i<nfft;++i) {
412
      opus_val32 phase = -i;
413
414
415
416
417
418
419
420
421
422
423
      kf_cexp2(twiddles+i, DIV32(SHL32(phase,17),nfft));
   }
#else
   for (i=0;i<nfft;++i) {
      const double pi=3.14159265358979323846264338327;
      double phase = ( -2*pi /nfft ) * i;
      kf_cexp(twiddles+i, phase );
   }
#endif
}

424
425
/*
 *
426
427
428
 * Allocates all necessary storage space for the fft and ifft.
 * The return value is a contiguous block of memory.  As such,
 * It can be freed with free().
429
 * */
430
kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,  const kiss_fft_state *base)
431
{
432
    kiss_fft_state *st=NULL;
433
    size_t memneeded = sizeof(struct kiss_fft_state); /* twiddle factors*/
434
435

    if ( lenmem==NULL ) {
436
        st = ( kiss_fft_state*)KISS_FFT_MALLOC( memneeded );
437
438
    }else{
        if (mem != NULL && *lenmem >= memneeded)
439
            st = (kiss_fft_state*)mem;
440
441
442
        *lenmem = memneeded;
    }
    if (st) {
443
        opus_int16 *bitrev;
Jean-Marc Valin's avatar
Jean-Marc Valin committed
444
445
        kiss_twiddle_cpx *twiddles;

446
        st->nfft=nfft;
447
448
449
450
451
452
453
#ifdef FIXED_POINT
        st->scale_shift = celt_ilog2(st->nfft);
        if (st->nfft == 1<<st->scale_shift)
           st->scale = Q15ONE;
        else
           st->scale = (1073741824+st->nfft/2)/st->nfft>>(15-st->scale_shift);
#else
454
        st->scale = 1.f/nfft;
455
#endif
456
457
458
459
460
461
462
        if (base != NULL)
        {
           st->twiddles = base->twiddles;
           st->shift = 0;
           while (nfft<<st->shift != base->nfft && st->shift < 32)
              st->shift++;
           if (st->shift>=32)
463
              goto fail;
464
        } else {
Jean-Marc Valin's avatar
Jean-Marc Valin committed
465
466
           st->twiddles = twiddles = (kiss_twiddle_cpx*)KISS_FFT_MALLOC(sizeof(kiss_twiddle_cpx)*nfft);
           compute_twiddles(twiddles, nfft);
467
           st->shift = -1;
468
        }
469
470
        if (!kf_factor(nfft,st->factors))
        {
471
           goto fail;
472
        }
Jean-Marc Valin's avatar
Jean-Marc Valin committed
473

474
        /* bitrev */
475
        st->bitrev = bitrev = (opus_int16*)KISS_FFT_MALLOC(sizeof(opus_int16)*nfft);
476
477
        if (st->bitrev==NULL)
            goto fail;
Jean-Marc Valin's avatar
Jean-Marc Valin committed
478
        compute_bitrev_table(0, bitrev, 1,1, st->factors,st);
479
480
    }
    return st;
481
fail:
482
    opus_fft_free(st);
483
    return NULL;
484
485
}

486
kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem )
487
{
488
   return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL);
489
}
490

491
void opus_fft_free(const kiss_fft_state *cfg)
492
{
493
494
   if (cfg)
   {
495
      opus_free((opus_int16*)cfg->bitrev);
496
      if (cfg->shift < 0)
497
498
         opus_free((kiss_twiddle_cpx*)cfg->twiddles);
      opus_free((kiss_fft_state*)cfg);
499
   }
500
501
}

502
#endif /* CUSTOM_MODES */
503

504
void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout)
505
{
506
507
508
509
    int m2, m;
    int p;
    int L;
    int fstride[MAXFACTORS];
Jean-Marc Valin's avatar
Jean-Marc Valin committed
510
    int i;
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
    int shift;

    /* st->shift can be -1 */
    shift = st->shift>0 ? st->shift : 0;

    fstride[0] = 1;
    L=0;
    do {
       p = st->factors[2*L];
       m = st->factors[2*L+1];
       fstride[L+1] = fstride[L]*p;
       L++;
    } while(m!=1);
    m = st->factors[2*L-1];
    for (i=L-1;i>=0;i--)
    {
       if (i!=0)
          m2 = st->factors[2*i-1];
       else
          m2 = 1;
       switch (st->factors[2*i])
       {
       case 2:
534
          kf_bfly2(fout, m, fstride[i]);
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
          break;
       case 4:
          kf_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
          break;
 #ifndef RADIX_TWO_ONLY
       case 3:
          kf_bfly3(fout,fstride[i]<<shift,st,m, fstride[i], m2);
          break;
       case 5:
          kf_bfly5(fout,fstride[i]<<shift,st,m, fstride[i], m2);
          break;
 #endif
       }
       m = m2;
    }
550
551
}

552
553
554
555
void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
{
   int i;
   opus_val16 scale;
556
557
558
559
#ifdef FIXED_POINT
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
      MULT16_32_Q15() on ARM. */
   int scale_shift = st->scale_shift-1;
560
#endif
561
   scale = st->scale;
562
563
564
565
566
567

   celt_assert2 (fin != fout, "In-place FFT not supported");
   /* Bit-reverse the input */
   for (i=0;i<st->nfft;i++)
   {
      kiss_fft_cpx x = fin[i];
568
569
      fout[st->bitrev[i]].r = SHR32(MULT16_32_Q16(scale, x.r), scale_shift);
      fout[st->bitrev[i]].i = SHR32(MULT16_32_Q16(scale, x.i), scale_shift);
570
571
572
573
   }
   opus_fft_impl(st, fout);
}

574

575
576
577
578
579
580
581
582
#ifdef TEST_UNIT_DFT_C
void opus_ifft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
{
   int i;
   celt_assert2 (fin != fout, "In-place FFT not supported");
   /* Bit-reverse the input */
   for (i=0;i<st->nfft;i++)
      fout[st->bitrev[i]] = fin[i];
583
584
585
586
587
   for (i=0;i<st->nfft;i++)
      fout[i].i = -fout[i].i;
   opus_fft_impl(st, fout);
   for (i=0;i<st->nfft;i++)
      fout[i].i = -fout[i].i;
588
589
}
#endif