/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

11
#include <assert.h>
John Koleszar's avatar
John Koleszar committed
12
#include <math.h>
13

14
#include "./vpx_config.h"
15
#include "./vp9_rtcd.h"
16

17
#include "vp9/common/vp9_blockd.h"
18
#include "vp9/common/vp9_idct.h"
19
20
#include "vp9/common/vp9_systemdependent.h"

21
22
23
24
25
26
// Applies ROUND_POWER_OF_TWO with DCT_CONST_BITS to a transform intermediate.
// Debug builds assert that the result lies within [INT16_MIN, INT16_MAX].
static INLINE int fdct_round_shift(int input) {
  const int rounded = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
  assert(rounded >= INT16_MIN && rounded <= INT16_MAX);
  return rounded;
}

27
// 1-D 4-point forward DCT: reads input[0..3] and writes output[0..3].
// All four inputs are consumed before any output is stored.
static void fdct4(const int16_t *input, int16_t *output) {
  // The butterfly terms are deliberately held in int16_t; the products below
  // are then formed in int.
  const int16_t sum_outer = input[0] + input[3];
  const int16_t sum_inner = input[1] + input[2];
  const int16_t diff_inner = input[1] - input[2];
  const int16_t diff_outer = input[0] - input[3];
  int prod_a, prod_b;

  // Even coefficients.
  prod_a = (sum_outer + sum_inner) * cospi_16_64;
  prod_b = (sum_outer - sum_inner) * cospi_16_64;
  output[0] = fdct_round_shift(prod_a);
  output[2] = fdct_round_shift(prod_b);

  // Odd coefficients.
  prod_a = diff_inner * cospi_24_64 + diff_outer * cospi_8_64;
  prod_b = -diff_inner * cospi_8_64 + diff_outer * cospi_24_64;
  output[1] = fdct_round_shift(prod_a);
  output[3] = fdct_round_shift(prod_b);
}
Jingning Han's avatar
Jingning Han committed
45

46
47
48
49
50
51
52
// DC-only forward transform of a 4x4 block.
//
// input:  top-left sample of the block; row r starts at input[r * stride].
// output: output[0] receives twice the sum of the 16 samples and output[1]
//         is cleared. No other output entries are written.
// stride: distance, in elements, between vertically adjacent samples.
void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  // Accumulate in int so the running total is never narrowed mid-sum.
  int sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      sum += input[r * stride + c];

  // Multiply rather than shift: left-shifting a negative value is undefined
  // behaviour in C, and the sum of residuals is frequently negative.
  output[0] = (int16_t)(sum * 2);
  output[1] = 0;
}

57
// 2-D 4x4 forward DCT.
//
// Two passes share the same 1-D butterfly. Pass 0 reads the columns of
// `input` (row pitch `stride`), pre-scaled by 16, and stores each transformed
// column as a row of a temporary buffer. Pass 1 does the same from that
// buffer into `output`, which puts the coefficients back in row order.
// Finally every coefficient is rounded and shifted right by 2.
void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
  int16_t intermediate[4 * 4];
  const int16_t *src = input;
  int16_t *dst = intermediate;
  int pass, n;

  for (pass = 0; pass < 2; ++pass) {
    const int first_pass = (pass == 0);
    // Element distance between successive samples of one source column.
    const int pitch = first_pass ? stride : 4;

    for (n = 0; n < 4; ++n) {
      int v[4];
      int sum_outer, sum_inner, diff_inner, diff_outer;
      int prod_a, prod_b;
      int k;

      // Load one column; only the first pass applies the x16 scaling.
      for (k = 0; k < 4; ++k) {
        v[k] = src[k * pitch];
        if (first_pass) v[k] *= 16;
      }
      // The very first sample of the block gets +1 when it is non-zero.
      if (first_pass && n == 0 && v[0]) {
        v[0] += 1;
      }

      // Butterfly.
      sum_outer = v[0] + v[3];
      sum_inner = v[1] + v[2];
      diff_inner = v[1] - v[2];
      diff_outer = v[0] - v[3];

      prod_a = (sum_outer + sum_inner) * cospi_16_64;
      prod_b = (sum_outer - sum_inner) * cospi_16_64;
      dst[0] = fdct_round_shift(prod_a);
      dst[2] = fdct_round_shift(prod_b);

      prod_a = diff_inner * cospi_24_64 + diff_outer * cospi_8_64;
      prod_b = -diff_inner * cospi_8_64 + diff_outer * cospi_24_64;
      dst[1] = fdct_round_shift(prod_a);
      dst[3] = fdct_round_shift(prod_b);

      // Next source column, next destination row.
      ++src;
      dst += 4;
    }

    // Second pass: temporary buffer -> caller's output.
    src = intermediate;
    dst = output;
  }

  // Final rounding of all 16 coefficients.
  for (n = 0; n < 4 * 4; ++n) {
    output[n] = (output[n] + 1) >> 2;
  }
}
121

122
// 1-D 4-point forward ADST: reads input[0..3] and writes output[0..3].
// An all-zero input short-circuits to an all-zero output.
static void fadst4(const int16_t *input, int16_t *output) {
  const int in0 = input[0];
  const int in1 = input[1];
  const int in2 = input[2];
  const int in3 = input[3];
  int mix_a, mix_b, mix_c, mix_d;

  if (in0 == 0 && in1 == 0 && in2 == 0 && in3 == 0) {
    output[0] = 0;
    output[1] = 0;
    output[2] = 0;
    output[3] = 0;
    return;
  }

  // Weighted combinations of the inputs with the sinpi_k_9 constants.
  mix_a = sinpi_1_9 * in0 + sinpi_2_9 * in1 + sinpi_4_9 * in3;
  mix_b = sinpi_3_9 * (in0 + in1 - in3);
  mix_c = sinpi_4_9 * in0 - sinpi_1_9 * in1 + sinpi_2_9 * in3;
  mix_d = sinpi_3_9 * in2;

  // 1-D transform scaling factor is sqrt(2).
  output[0] = fdct_round_shift(mix_a + mix_d);
  output[1] = fdct_round_shift(mix_b);
  output[2] = fdct_round_shift(mix_c - mix_d);
  output[3] = fdct_round_shift(mix_c - mix_a + mix_d);
}
161

Dmitry Kovalev's avatar
Dmitry Kovalev committed
162
static const transform_2d FHT_4[] = {
163
164
165
166
  { fdct4,  fdct4  },  // DCT_DCT  = 0
  { fadst4, fdct4  },  // ADST_DCT = 1
  { fdct4,  fadst4 },  // DCT_ADST = 2
  { fadst4, fadst4 }   // ADST_ADST = 3
167
168
};

169
170
171
172
173
174
175
176
177
178
void vp9_fht4x4_c(const int16_t *input, int16_t *output,
                  int stride, int tx_type) {
  if (tx_type == DCT_DCT) {
    vp9_fdct4x4_c(input, output, stride);
  } else {
    int16_t out[4 * 4];
    int16_t *outptr = &out[0];
    int i, j;
    int16_t temp_in[4], temp_out[4];
    const transform_2d ht = FHT_4[tx_type];
179

180
181
182
183
184
185
186
187
188
189
    // Columns
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j)
        temp_in[j] = input[j * stride + i] * 16;
      if (i == 0 && temp_in[0])
        temp_in[0] += 1;
      ht.cols(temp_in, temp_out);
      for (j = 0; j < 4; ++j)
        outptr[j * 4 + i] = temp_out[j];
    }
Yaowu Xu's avatar
Yaowu Xu committed
190

191
192
193
194
195
196
197
198
    // Rows
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j)
        temp_in[j] = out[j + i * 4];
      ht.rows(temp_in, temp_out);
      for (j = 0; j < 4; ++j)
        output[j + i * 4] = (temp_out[j] + 1) >> 2;
    }
John Koleszar's avatar
John Koleszar committed
199
  }
Yaowu Xu's avatar
Yaowu Xu committed
200
}
201

202
// 1-D 8-point forward DCT: reads input[0..7] and writes output[0..7].
// Even-indexed outputs come from a 4-point DCT of the mirrored sums;
// odd-indexed outputs come from the mirrored differences.
static void fdct8(const int16_t *input, int16_t *output) {
  int sum[4];   // sum[k]  = input[k] + input[7 - k]
  int diff[4];  // diff[k] = input[k] - input[7 - k]
  int x0, x1, x2, x3;
  int r0, r1;
  int k;

  // Stage 1: fold the input around its centre. Every input element is read
  // here, before any output element is stored.
  for (k = 0; k < 4; ++k) {
    sum[k] = input[k] + input[7 - k];
    diff[k] = input[k] - input[7 - k];
  }

  // Even half: 4-point DCT of the sums.
  x0 = sum[0] + sum[3];
  x1 = sum[1] + sum[2];
  x2 = sum[1] - sum[2];
  x3 = sum[0] - sum[3];
  output[0] = fdct_round_shift((x0 + x1) * cospi_16_64);
  output[2] = fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64);
  output[4] = fdct_round_shift((x0 - x1) * cospi_16_64);
  output[6] = fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64);

  // Stage 2: rotate the two middle differences.
  r0 = fdct_round_shift((diff[1] - diff[2]) * cospi_16_64);
  r1 = fdct_round_shift((diff[1] + diff[2]) * cospi_16_64);

  // Stage 3.
  x0 = diff[3] + r0;
  x1 = diff[3] - r0;
  x2 = diff[0] - r1;
  x3 = diff[0] + r1;

  // Stage 4: odd coefficients.
  output[1] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64);
  output[3] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64);
  output[5] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64);
  output[7] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64);
}
253

254
255
256
257
258
259
260
// DC-only forward transform of an 8x8 block: output[0] receives the sum of
// the 64 samples (row r starts at input[r * stride]) and output[1] is
// cleared. No other output entries are written.
void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
  int16_t total = 0;
  int y, x;

  for (y = 0; y < 8; ++y) {
    const int16_t *row = input + y * stride;
    for (x = 0; x < 8; ++x) {
      total += row[x];
    }
  }

  output[0] = total;
  output[1] = 0;
}

265
void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
Yunqing Wang's avatar
Yunqing Wang committed
266
  int i, j;
Christian Duvivier's avatar
Christian Duvivier committed
267
  int16_t intermediate[64];
268

Christian Duvivier's avatar
Christian Duvivier committed
269
270
271
272
273
274
275
276
277
278
  // Transform columns
  {
    int16_t *output = intermediate;
    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
    /*needs32*/ int t0, t1, t2, t3;
    /*canbe16*/ int x0, x1, x2, x3;

    int i;
    for (i = 0; i < 8; i++) {
      // stage 1
Yaowu Xu's avatar
Yaowu Xu committed
279
280
281
282
283
284
285
286
      s0 = (input[0 * stride] + input[7 * stride]) * 4;
      s1 = (input[1 * stride] + input[6 * stride]) * 4;
      s2 = (input[2 * stride] + input[5 * stride]) * 4;
      s3 = (input[3 * stride] + input[4 * stride]) * 4;
      s4 = (input[3 * stride] - input[4 * stride]) * 4;
      s5 = (input[2 * stride] - input[5 * stride]) * 4;
      s6 = (input[1 * stride] - input[6 * stride]) * 4;
      s7 = (input[0 * stride] - input[7 * stride]) * 4;
Christian Duvivier's avatar
Christian Duvivier committed
287

288
      // fdct4(step, step);
Christian Duvivier's avatar
Christian Duvivier committed
289
290
291
292
293
294
295
296
      x0 = s0 + s3;
      x1 = s1 + s2;
      x2 = s1 - s2;
      x3 = s0 - s3;
      t0 = (x0 + x1) * cospi_16_64;
      t1 = (x0 - x1) * cospi_16_64;
      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
297
298
299
300
      output[0 * 8] = fdct_round_shift(t0);
      output[2 * 8] = fdct_round_shift(t2);
      output[4 * 8] = fdct_round_shift(t1);
      output[6 * 8] = fdct_round_shift(t3);
Christian Duvivier's avatar
Christian Duvivier committed
301
302
303
304

      // Stage 2
      t0 = (s6 - s5) * cospi_16_64;
      t1 = (s6 + s5) * cospi_16_64;
305
306
      t2 = fdct_round_shift(t0);
      t3 = fdct_round_shift(t1);
Christian Duvivier's avatar
Christian Duvivier committed
307
308
309
310
311
312
313
314
315
316
317
318

      // Stage 3
      x0 = s4 + t2;
      x1 = s4 - t2;
      x2 = s7 - t3;
      x3 = s7 + t3;

      // Stage 4
      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
319
320
321
322
      output[1 * 8] = fdct_round_shift(t0);
      output[3 * 8] = fdct_round_shift(t2);
      output[5 * 8] = fdct_round_shift(t1);
      output[7 * 8] = fdct_round_shift(t3);
Christian Duvivier's avatar
Christian Duvivier committed
323
324
      input++;
      output++;
325
326
    }
  }
327

Dmitry Kovalev's avatar
Dmitry Kovalev committed
328
  // Rows
Yunqing Wang's avatar
Yunqing Wang committed
329
  for (i = 0; i < 8; ++i) {
330
    fdct8(&intermediate[i * 8], &final_output[i * 8]);
Yunqing Wang's avatar
Yunqing Wang committed
331
    for (j = 0; j < 8; ++j)
Christian Duvivier's avatar
Christian Duvivier committed
332
      final_output[j + i * 8] /= 2;
333
  }
Yunqing Wang's avatar
Yunqing Wang committed
334
}
335

336
337
338
339
340
341
342
// DC-only forward transform of a 16x16 block.
//
// input:  top-left sample of the block; row r starts at input[r * stride].
// output: output[0] receives the sum of the 256 samples shifted right by 1
//         and output[1] is cleared. No other output entries are written.
// stride: distance, in elements, between vertically adjacent samples.
void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  // The total of 256 samples can exceed the int16_t range (e.g. 256 * 255),
  // so accumulate in int and narrow only after the halving below.
  int sum = 0;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c)
      sum += input[r * stride + c];

  output[0] = (int16_t)(sum >> 1);
  output[1] = 0;
}

347
void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
348
349
350
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
351
  // as the first pass results are transposed, we transpose the columns (that
352
353
354
355
356
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // We need an intermediate buffer between passes.
  int16_t intermediate[256];
357
  const int16_t *in = input;
358
359
360
361
362
363
364
365
366
367
368
369
  int16_t *out = intermediate;
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    /*canbe16*/ int step1[8];
    /*canbe16*/ int step2[8];
    /*canbe16*/ int step3[8];
    /*canbe16*/ int input[8];
    /*needs32*/ int temp1, temp2;
    int i;
    for (i = 0; i < 16; i++) {
      if (0 == pass) {
        // Calculate input for the first 8 results.
Yaowu Xu's avatar
Yaowu Xu committed
370
371
372
373
374
375
376
377
        input[0] = (in[0 * stride] + in[15 * stride]) * 4;
        input[1] = (in[1 * stride] + in[14 * stride]) * 4;
        input[2] = (in[2 * stride] + in[13 * stride]) * 4;
        input[3] = (in[3 * stride] + in[12 * stride]) * 4;
        input[4] = (in[4 * stride] + in[11 * stride]) * 4;
        input[5] = (in[5 * stride] + in[10 * stride]) * 4;
        input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
        input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
378
        // Calculate input for the next 8 results.
Yaowu Xu's avatar
Yaowu Xu committed
379
380
381
382
383
384
385
386
        step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
        step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
        step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
        step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
        step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
        step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
        step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
        step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
      } else {
        // Calculate input for the first 8 results.
        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
        // Calculate input for the next 8 results.
        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
406
      }
407
      // Work on the first eight values; fdct8(input, even_results);
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
      {
        /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
        /*needs32*/ int t0, t1, t2, t3;
        /*canbe16*/ int x0, x1, x2, x3;

        // stage 1
        s0 = input[0] + input[7];
        s1 = input[1] + input[6];
        s2 = input[2] + input[5];
        s3 = input[3] + input[4];
        s4 = input[3] - input[4];
        s5 = input[2] - input[5];
        s6 = input[1] - input[6];
        s7 = input[0] - input[7];

423
        // fdct4(step, step);
424
425
426
427
428
429
430
431
        x0 = s0 + s3;
        x1 = s1 + s2;
        x2 = s1 - s2;
        x3 = s0 - s3;
        t0 = (x0 + x1) * cospi_16_64;
        t1 = (x0 - x1) * cospi_16_64;
        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
432
433
434
435
        out[0] = fdct_round_shift(t0);
        out[4] = fdct_round_shift(t2);
        out[8] = fdct_round_shift(t1);
        out[12] = fdct_round_shift(t3);
436
437
438
439

        // Stage 2
        t0 = (s6 - s5) * cospi_16_64;
        t1 = (s6 + s5) * cospi_16_64;
440
441
        t2 = fdct_round_shift(t0);
        t3 = fdct_round_shift(t1);
442
443
444
445
446
447
448
449
450
451
452
453

        // Stage 3
        x0 = s4 + t2;
        x1 = s4 - t2;
        x2 = s7 - t3;
        x3 = s7 + t3;

        // Stage 4
        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
454
455
456
457
        out[2] = fdct_round_shift(t0);
        out[6] = fdct_round_shift(t2);
        out[10] = fdct_round_shift(t1);
        out[14] = fdct_round_shift(t3);
458
      }
459
460
461
462
463
      // Work on the next eight values; step1 -> odd_results
      {
        // step 2
        temp1 = (step1[5] - step1[2]) * cospi_16_64;
        temp2 = (step1[4] - step1[3]) * cospi_16_64;
464
465
        step2[2] = fdct_round_shift(temp1);
        step2[3] = fdct_round_shift(temp2);
466
467
        temp1 = (step1[4] + step1[3]) * cospi_16_64;
        temp2 = (step1[5] + step1[2]) * cospi_16_64;
468
469
        step2[4] = fdct_round_shift(temp1);
        step2[5] = fdct_round_shift(temp2);
470
471
472
473
474
475
476
477
478
479
480
        // step 3
        step3[0] = step1[0] + step2[3];
        step3[1] = step1[1] + step2[2];
        step3[2] = step1[1] - step2[2];
        step3[3] = step1[0] - step2[3];
        step3[4] = step1[7] - step2[4];
        step3[5] = step1[6] - step2[5];
        step3[6] = step1[6] + step2[5];
        step3[7] = step1[7] + step2[4];
        // step 4
        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
481
        temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
482
483
        step2[1] = fdct_round_shift(temp1);
        step2[2] = fdct_round_shift(temp2);
484
        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
485
        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
486
487
        step2[5] = fdct_round_shift(temp1);
        step2[6] = fdct_round_shift(temp2);
488
489
490
        // step 5
        step1[0] = step3[0] + step2[1];
        step1[1] = step3[0] - step2[1];
491
492
493
494
        step1[2] = step3[3] + step2[2];
        step1[3] = step3[3] - step2[2];
        step1[4] = step3[4] - step2[5];
        step1[5] = step3[4] + step2[5];
495
496
497
498
499
        step1[6] = step3[7] - step2[6];
        step1[7] = step3[7] + step2[6];
        // step 6
        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
500
501
        out[1] = fdct_round_shift(temp1);
        out[9] = fdct_round_shift(temp2);
502
503
        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
504
505
        out[5] = fdct_round_shift(temp1);
        out[13] = fdct_round_shift(temp2);
506
507
        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
508
509
        out[3] = fdct_round_shift(temp1);
        out[11] = fdct_round_shift(temp2);
510
511
        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
512
513
        out[7] = fdct_round_shift(temp1);
        out[15] = fdct_round_shift(temp2);
514
515
516
517
      }
      // Do next column (which is a transposed row in second/horizontal pass)
      in++;
      out += 16;
518
    }
519
520
521
    // Setup in/out for next pass.
    in = intermediate;
    out = output;
522
  }
523
524
}

525
// 1-D 8-point forward ADST: reads input[0..7] and writes output[0..7].
// The input is permuted on load and the output is permuted, with alternating
// sign, on store.
static void fadst8(const int16_t *input, int16_t *output) {
  int x[8];
  int s[8];
  int k;

  // Input permutation.
  x[0] = input[7];
  x[1] = input[0];
  x[2] = input[5];
  x[3] = input[2];
  x[4] = input[3];
  x[5] = input[4];
  x[6] = input[1];
  x[7] = input[6];

  // stage 1
  s[0] = cospi_2_64  * x[0] + cospi_30_64 * x[1];
  s[1] = cospi_30_64 * x[0] - cospi_2_64  * x[1];
  s[2] = cospi_10_64 * x[2] + cospi_22_64 * x[3];
  s[3] = cospi_22_64 * x[2] - cospi_10_64 * x[3];
  s[4] = cospi_18_64 * x[4] + cospi_14_64 * x[5];
  s[5] = cospi_14_64 * x[4] - cospi_18_64 * x[5];
  s[6] = cospi_26_64 * x[6] + cospi_6_64  * x[7];
  s[7] = cospi_6_64  * x[6] - cospi_26_64 * x[7];

  for (k = 0; k < 4; ++k) {
    x[k] = fdct_round_shift(s[k] + s[k + 4]);
  }
  for (k = 0; k < 4; ++k) {
    x[k + 4] = fdct_round_shift(s[k] - s[k + 4]);
  }

  // stage 2
  for (k = 0; k < 4; ++k) {
    s[k] = x[k];
  }
  s[4] = cospi_8_64  * x[4] + cospi_24_64 * x[5];
  s[5] = cospi_24_64 * x[4] - cospi_8_64  * x[5];
  s[6] = - cospi_24_64 * x[6] + cospi_8_64  * x[7];
  s[7] =   cospi_8_64  * x[6] + cospi_24_64 * x[7];

  x[0] = s[0] + s[2];
  x[1] = s[1] + s[3];
  x[2] = s[0] - s[2];
  x[3] = s[1] - s[3];
  x[4] = fdct_round_shift(s[4] + s[6]);
  x[5] = fdct_round_shift(s[5] + s[7]);
  x[6] = fdct_round_shift(s[4] - s[6]);
  x[7] = fdct_round_shift(s[5] - s[7]);

  // stage 3
  s[2] = cospi_16_64 * (x[2] + x[3]);
  s[3] = cospi_16_64 * (x[2] - x[3]);
  s[6] = cospi_16_64 * (x[6] + x[7]);
  s[7] = cospi_16_64 * (x[6] - x[7]);

  x[2] = fdct_round_shift(s[2]);
  x[3] = fdct_round_shift(s[3]);
  x[6] = fdct_round_shift(s[6]);
  x[7] = fdct_round_shift(s[7]);

  // Output permutation; odd-indexed outputs are negated.
  output[0] =   x[0];
  output[1] = - x[4];
  output[2] =   x[6];
  output[3] = - x[2];
  output[4] =   x[3];
  output[5] = - x[7];
  output[6] =   x[5];
  output[7] = - x[1];
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
596
static const transform_2d FHT_8[] = {
597
598
599
600
  { fdct8,  fdct8  },  // DCT_DCT  = 0
  { fadst8, fdct8  },  // ADST_DCT = 1
  { fdct8,  fadst8 },  // DCT_ADST = 2
  { fadst8, fadst8 }   // ADST_ADST = 3
Dmitry Kovalev's avatar
Dmitry Kovalev committed
601
};
John Koleszar's avatar
John Koleszar committed
602

603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
// 2-D 8x8 forward hybrid transform.
//
// tx_type == DCT_DCT is forwarded to vp9_fdct8x8_c. Any other tx_type picks a
// pair of 1-D transforms from FHT_8: `cols` is run down every column of the
// input (pre-scaled by 4), then `rows` is run along every row of that result.
// Each final coefficient is halved, with 1 added to negative values first.
void vp9_fht8x8_c(const int16_t *input, int16_t *output,
                  int stride, int tx_type) {
  int16_t col_pass[64];
  int16_t line_in[8], line_out[8];
  const transform_2d *ht;
  int r, c;

  if (tx_type == DCT_DCT) {
    vp9_fdct8x8_c(input, output, stride);
    return;
  }
  ht = &FHT_8[tx_type];

  // Vertical pass.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) {
      line_in[r] = input[r * stride + c] * 4;
    }
    ht->cols(line_in, line_out);
    for (r = 0; r < 8; ++r) {
      col_pass[r * 8 + c] = line_out[r];
    }
  }

  // Horizontal pass plus final scaling.
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      line_in[c] = col_pass[r * 8 + c];
    }
    ht->rows(line_in, line_out);
    for (c = 0; c < 8; ++c) {
      output[r * 8 + c] = (line_out[c] + (line_out[c] < 0)) >> 1;
    }
  }
}

Timothy B. Terriberry's avatar
Timothy B. Terriberry committed
634
635
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
   pixel. */
636
void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
John Koleszar's avatar
John Koleszar committed
637
  int i;
Timothy B. Terriberry's avatar
Timothy B. Terriberry committed
638
  int a1, b1, c1, d1, e1;
639
  const int16_t *ip = input;
640
  int16_t *op = output;
John Koleszar's avatar
John Koleszar committed
641
642

  for (i = 0; i < 4; i++) {
643
644
645
646
    a1 = ip[0 * stride];
    b1 = ip[1 * stride];
    c1 = ip[2 * stride];
    d1 = ip[3 * stride];
Timothy B. Terriberry's avatar
Timothy B. Terriberry committed
647

Yaowu Xu's avatar
Yaowu Xu committed
648
649
650
651
652
653
654
    a1 += b1;
    d1 = d1 - c1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= c1;
    d1 += b1;
Timothy B. Terriberry's avatar
Timothy B. Terriberry committed
655
656
657
658
    op[0] = a1;
    op[4] = c1;
    op[8] = d1;
    op[12] = b1;
John Koleszar's avatar
John Koleszar committed
659
660
661
662
663
664
665
666

    ip++;
    op++;
  }
  ip = output;
  op = output;

  for (i = 0; i < 4; i++) {
Timothy B. Terriberry's avatar
Timothy B. Terriberry committed
667
668
669
670
671
    a1 = ip[0];
    b1 = ip[1];
    c1 = ip[2];
    d1 = ip[3];

Yaowu Xu's avatar
Yaowu Xu committed
672
673
674
675
676
677
678
    a1 += b1;
    d1 -= c1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= c1;
    d1 += b1;
Yaowu Xu's avatar
Yaowu Xu committed
679
680
681
682
    op[0] = a1 * UNIT_QUANT_FACTOR;
    op[1] = c1 * UNIT_QUANT_FACTOR;
    op[2] = d1 * UNIT_QUANT_FACTOR;
    op[3] = b1 * UNIT_QUANT_FACTOR;
John Koleszar's avatar
John Koleszar committed
683
684
685
686

    ip += 4;
    op += 4;
  }
Hui Su's avatar
Hui Su committed
687
688
}

Yunqing Wang's avatar
Yunqing Wang committed
689
// Rewrote to use same algorithm as others.
690
static void fdct16(const int16_t in[16], int16_t out[16]) {
691
692
693
694
695
  /*canbe16*/ int step1[8];
  /*canbe16*/ int step2[8];
  /*canbe16*/ int step3[8];
  /*canbe16*/ int input[8];
  /*needs32*/ int temp1, temp2;
Yunqing Wang's avatar
Yunqing Wang committed
696
697

  // step 1
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
  input[0] = in[0] + in[15];
  input[1] = in[1] + in[14];
  input[2] = in[2] + in[13];
  input[3] = in[3] + in[12];
  input[4] = in[4] + in[11];
  input[5] = in[5] + in[10];
  input[6] = in[6] + in[ 9];
  input[7] = in[7] + in[ 8];

  step1[0] = in[7] - in[ 8];
  step1[1] = in[6] - in[ 9];
  step1[2] = in[5] - in[10];
  step1[3] = in[4] - in[11];
  step1[4] = in[3] - in[12];
  step1[5] = in[2] - in[13];
  step1[6] = in[1] - in[14];
  step1[7] = in[0] - in[15];

716
  // fdct8(step, step);
717
  {
718
719
720
721
722
723
724
725
726
727
728
729
730
731
    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
    /*needs32*/ int t0, t1, t2, t3;
    /*canbe16*/ int x0, x1, x2, x3;

    // stage 1
    s0 = input[0] + input[7];
    s1 = input[1] + input[6];
    s2 = input[2] + input[5];
    s3 = input[3] + input[4];
    s4 = input[3] - input[4];
    s5 = input[2] - input[5];
    s6 = input[1] - input[6];
    s7 = input[0] - input[7];

732
    // fdct4(step, step);
733
734
735
736
737
738
739
740
    x0 = s0 + s3;
    x1 = s1 + s2;
    x2 = s1 - s2;
    x3 = s0 - s3;
    t0 = (x0 + x1) * cospi_16_64;
    t1 = (x0 - x1) * cospi_16_64;
    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
741
742
743
744
    out[0] = fdct_round_shift(t0);
    out[4] = fdct_round_shift(t2);
    out[8] = fdct_round_shift(t1);
    out[12] = fdct_round_shift(t3);
745
746
747
748

    // Stage 2
    t0 = (s6 - s5) * cospi_16_64;
    t1 = (s6 + s5) * cospi_16_64;
749
750
    t2 = fdct_round_shift(t0);
    t3 = fdct_round_shift(t1);
751
752
753
754
755
756
757
758
759
760
761
762

    // Stage 3
    x0 = s4 + t2;
    x1 = s4 - t2;
    x2 = s7 - t3;
    x3 = s7 + t3;

    // Stage 4
    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;
    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
763
764
765
766
    out[2] = fdct_round_shift(t0);
    out[6] = fdct_round_shift(t2);
    out[10] = fdct_round_shift(t1);
    out[14] = fdct_round_shift(t3);
767
  }
Yunqing Wang's avatar
Yunqing Wang committed
768
769

  // step 2
770
771
  temp1 = (step1[5] - step1[2]) * cospi_16_64;
  temp2 = (step1[4] - step1[3]) * cospi_16_64;
772
773
  step2[2] = fdct_round_shift(temp1);
  step2[3] = fdct_round_shift(temp2);
774
775
  temp1 = (step1[4] + step1[3]) * cospi_16_64;
  temp2 = (step1[5] + step1[2]) * cospi_16_64;
776
777
  step2[4] = fdct_round_shift(temp1);
  step2[5] = fdct_round_shift(temp2);
Yunqing Wang's avatar
Yunqing Wang committed
778
779

  // step 3
780
781
782
783
784
785
786
787
  step3[0] = step1[0] + step2[3];
  step3[1] = step1[1] + step2[2];
  step3[2] = step1[1] - step2[2];
  step3[3] = step1[0] - step2[3];
  step3[4] = step1[7] - step2[4];
  step3[5] = step1[6] - step2[5];
  step3[6] = step1[6] + step2[5];
  step3[7] = step1[7] + step2[4];
Yunqing Wang's avatar
Yunqing Wang committed
788
789

  // step 4
790
  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
791
  temp2 = step3[2] * cospi_24_64 + step3[5] *  cospi_8_64;
792
793
  step2[1] = fdct_round_shift(temp1);
  step2[2] = fdct_round_shift(temp2);
794
  temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
795
  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
796
797
  step2[5] = fdct_round_shift(temp1);
  step2[6] = fdct_round_shift(temp2);
Yunqing Wang's avatar
Yunqing Wang committed
798
799

  // step 5
800
801
  step1[0] = step3[0] + step2[1];
  step1[1] = step3[0] - step2[1];
802
803
804
805
  step1[2] = step3[3] + step2[2];
  step1[3] = step3[3] - step2[2];
  step1[4] = step3[4] - step2[5];
  step1[5] = step3[4] + step2[5];
806
807
  step1[6] = step3[7] - step2[6];
  step1[7] = step3[7] + step2[6];
Yunqing Wang's avatar
Yunqing Wang committed
808
809

  // step 6
810
811
  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
812
813
  out[1] = fdct_round_shift(temp1);
  out[9] = fdct_round_shift(temp2);
814
815
816

  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
817
818
  out[5] = fdct_round_shift(temp1);
  out[13] = fdct_round_shift(temp2);
819
820
821

  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
822
823
  out[3] = fdct_round_shift(temp1);
  out[11] = fdct_round_shift(temp2);
824
825
826

  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
827
828
  out[7] = fdct_round_shift(temp1);
  out[15] = fdct_round_shift(temp2);
Daniel Kang's avatar
Daniel Kang committed
829
}
830

831
static void fadst16(const int16_t *input, int16_t *output) {
832
833
  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

Dmitry Kovalev's avatar
Dmitry Kovalev committed
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
  int x0 = input[15];
  int x1 = input[0];
  int x2 = input[13];
  int x3 = input[2];
  int x4 = input[11];
  int x5 = input[4];
  int x6 = input[9];
  int x7 = input[6];
  int x8 = input[7];
  int x9 = input[8];
  int x10 = input[5];
  int x11 = input[10];
  int x12 = input[3];
  int x13 = input[12];
  int x14 = input[1];
  int x15 = input[14];
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
  x0 = fdct_round_shift(s0 + s8);
  x1 = fdct_round_shift(s1 + s9);
  x2 = fdct_round_shift(s2 + s10);
  x3 = fdct_round_shift(s3 + s11);
  x4 = fdct_round_shift(s4 + s12);
  x5 = fdct_round_shift(s5 + s13);
  x6 = fdct_round_shift(s6 + s14);
  x7 = fdct_round_shift(s7 + s15);
  x8  = fdct_round_shift(s0 - s8);
  x9  = fdct_round_shift(s1 - s9);
  x10 = fdct_round_shift(s2 - s10);
  x11 = fdct_round_shift(s3 - s11);
  x12 = fdct_round_shift(s4 - s12);
  x13 = fdct_round_shift(s5 - s13);
  x14 = fdct_round_shift(s6 - s14);
  x15 = fdct_round_shift(s7 - s15);
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
912
913
914
915
916
917
918
919
  x8 = fdct_round_shift(s8 + s12);
  x9 = fdct_round_shift(s9 + s13);
  x10 = fdct_round_shift(s10 + s14);
  x11 = fdct_round_shift(s11 + s15);
  x12 = fdct_round_shift(s8 - s12);
  x13 = fdct_round_shift(s9 - s13);
  x14 = fdct_round_shift(s10 - s14);
  x15 = fdct_round_shift(s11 - s15);
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
943
944
945
946
  x4 = fdct_round_shift(s4 + s6);
  x5 = fdct_round_shift(s5 + s7);
  x6 = fdct_round_shift(s4 - s6);
  x7 = fdct_round_shift(s5 - s7);
947
948
949
950
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
951
952
953
954
  x12 = fdct_round_shift(s12 + s14);
  x13 = fdct_round_shift(s13 + s15);
  x14 = fdct_round_shift(s12 - s14);
  x15 = fdct_round_shift(s13 - s15);
955
956
957
958
959
960
961
962
963
964
965

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

966
967
968
969
970
971
972
973
  x2 = fdct_round_shift(s2);
  x3 = fdct_round_shift(s3);
  x6 = fdct_round_shift(s6);
  x7 = fdct_round_shift(s7);
  x10 = fdct_round_shift(s10);
  x11 = fdct_round_shift(s11);
  x14 = fdct_round_shift(s14);
  x15 = fdct_round_shift(s15);
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990

  output[0] = x0;
  output[1] = - x8;
  output[2] = x12;
  output[3] = - x4;
  output[4] = x6;
  output[5] = x14;
  output[6] = x10;
  output[7] = x2;
  output[8] = x3;
  output[9] =  x11;
  output[10] = x15;
  output[11] = x7;
  output[12] = x5;
  output[13] = - x13;
  output[14] = x9;
  output[15] = - x1;
991
992
}

static const transform_2d FHT_16[] = {
994
995
996
997
  { fdct16,  fdct16  },  // DCT_DCT  = 0
  { fadst16, fdct16  },  // ADST_DCT = 1
  { fdct16,  fadst16 },  // DCT_ADST = 2
  { fadst16, fadst16 }   // ADST_ADST = 3
Dmitry Kovalev's avatar
Dmitry Kovalev committed
998
};
void vp9_fht16x16_c(const int16_t *input, int16_t *output,
                    int stride, int tx_type) {
  if (tx_type == DCT_DCT) {
    vp9_fdct16x16_c(input, output, stride);
  } else {
    int16_t out[256];
    int16_t *outptr = &out[0];
    int i, j;
    int16_t temp_in[16], temp_out[16];
    const transform_2d ht = FHT_16[tx_type];

    // Columns
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j)
        temp_in[j] = input[j * stride + i] * 4;
      ht.cols(temp_in, temp_out);
      for (j = 0; j < 16; ++j)
        outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
    }
1019

1020
1021
1022
1023
1024
1025
1026
1027
    // Rows
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j)
        temp_in[j] = out[j + i * 16];
      ht.rows(temp_in, temp_out);
      for (j = 0; j < 16; ++j)
        output[j + i * 16] = temp_out[j];
    }
1028
  }
1029
}
// Scales `input` down with ROUND_POWER_OF_TWO(input, DCT_CONST_BITS) and
// asserts that the result lies in [-131072, 131071], i.e. fits in 18 signed
// bits.
static INLINE int dct_32_round(int input) {
  const int rounded = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
  assert(rounded >= -131072 && rounded <= 131071);
  return rounded;
}

// Shifts `input` right by 2 after adding a rounding bias of 1 for
// non-negative values and 2 for negative values.
// NOTE(review): the result for negative inputs relies on the compiler
// implementing >> on signed ints as an arithmetic shift - confirm for new
// targets.
static INLINE int half_round_shift(int input) {
  const int bias = 1 + (input < 0);
  return (input + bias) >> 2;
}
static void fdct32(const int *input, int *output, int round) {
Yunqing Wang's avatar
Yunqing Wang committed
1043
  int step[32];
1044
  // Stage 1
Yunqing Wang's avatar
Yunqing Wang committed
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
  step[0] = input[0] + input[(32 - 1)];
  step[1] = input[1] + input[(32 - 2)];
  step[2] = input[2] + input[(32 - 3)];
  step[3] = input[3] + input[(32 - 4)];
  step[4] = input[4] + input[(32 - 5)];
  step[5] = input[5] + input[(32 - 6)];
  step[6] = input[6] + input[(32 - 7)];
  step[7] = input[7] + input[(32 - 8)];
  step[8] = input[8] + input[(32 - 9)];
  step[9] = input[9] + input[(32 - 10)];
  step[10] = input[10] + input[(32 - 11)];
  step[11] = input[11] + input[(32 - 12)];
  step[12] = input[12] + input[(32 - 13)];
  step[13] = input[13] + input[(32 - 14)];
  step[14] = input[14] + input[(32 - 15)];
  step[15] = input[15] + input[(32 - 16)];
  step[16] = -input[16] + input[(32 - 17)];
  step[17] = -input[17] + input[(32 - 18)];
  step[18] = -input[18] + input[(32 - 19)];
  step[19] = -input[19] + input[(32 - 20)];
  step[20] = -input[20] + input[(32 - 21)];
  step[21] = -input[21] + input[(32 - 22)];
  step[22] = -input[22] + input[(32 - 23)];
  step[23] = -input[23] + input[(32 - 24)];
  step[24] = -input[24] + input[(32 - 25)];
  step[25] = -input[25] + input[(32 - 26)];
  step[26] = -input[26] + input[(32 - 27)];
  step[27] = -input[27] + input[(32 - 28)];
  step[28] = -input[28] + input[(32 - 29)];
  step[29] = -input[29] + input[(32 - 30)];
  step[30] = -input[30] + input[(32 - 31)];
  step[31] = -input[31] + input[(32 - 32)];
1077
1078

  // Stage 2
Yunqing Wang's avatar
Yunqing Wang committed
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
  output[0] = step[0] + step[16 - 1];
  output[1] = step[1] + step[16 - 2];
  output[2] = step[2] + step[16 - 3];
  output[3] = step[3] + step[16 - 4];
  output[4] = step[4] + step[16 - 5];
  output[5] = step[5] + step[16 - 6];
  output[6] = step[6] + step[16 - 7];
  output[7] = step[7] + step[16 - 8];
  output[8] = -step[8] + step[16 - 9];
  output[9] = -step[9] + step[16 - 10];
  output[10] = -step[10] + step[16 - 11];
  output[11] = -step[11] + step[16 - 12];
  output[12] = -step[12] + step[16 - 13];
  output[13] = -step[13] + step[16 - 14];
  output[14] = -step[14] + step[16 - 15];
  output[15] = -step[15] + step[16 - 16];

  output[16] = step[16];
  output[17] = step[17];
  output[18] = step[18];
  output[19] = step[19];

Yaowu Xu's avatar
Yaowu Xu committed
1101
1102
1103
1104
  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
Yunqing Wang's avatar
Yunqing Wang committed
1105

Yaowu Xu's avatar
Yaowu Xu committed
1106
1107
1108
1109
  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
Yunqing Wang's avatar
Yunqing Wang committed
1110
1111
1112
1113
1114

  output[28] = step[28];
  output[29] = step[29];
  output[30] = step[30];
  output[31] = step[31];
1115

1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
  // dump the magnitude by 4, hence the intermediate values are within
  // the range of 16 bits.
  if (round) {
    output[0] = half_round_shift(output[0]);
    output[1] = half_round_shift(output[1]);
    output[2] = half_round_shift(output[2]);
    output[3] = half_round_shift(output[3]);
    output[4] = half_round_shift(output[4]);
    output[5] = half_round_shift(output[5]);
    output[6] = half_round_shift(output[6]);
    output[7] = half_round_shift(output[7]);
    output[8] = half_round_shift(output[8]);
    output[9] = half_round_shift(output[9]);
    output[10] = half_round_shift(output[10]);
    output[11] = half_round_shift(output[11]);
    output[12] = half_round_shift(output[12]);
    output[13] = half_round_shift(output[13]);
    output[14] = half_round_shift(output[14]);
    output[15] = half_round_shift(output[15]);

    output[16] = half_round_shift(output[16]);
    output[17] = half_round_shift(output[17]);
    output[18] = half_round_shift(output[18]);
    output[19] = half_round_shift(output[19]);
    output[20] = half_round_shift(output[20]);
    output[21] = half_round_shift(output[21]);
    output[22] = half_round_shift(output[22]);
    output[23] = half_round_shift(output[23]);
    output[24] = half_round_shift(output[24]);
    output[25] = half_round_shift(output[25]);
    output[26] = half_round_shift(output[26]);
    output[27] = half_round_shift(output[27]);
    output[28] = half_round_shift(output[28]);
    output[29] = half_round_shift(output[29]);
    output[30] = half_round_shift(output[30]);
    output[31] = half_round_shift(output[31]);
  }

1154
  // Stage 3
Yunqing Wang's avatar
Yunqing Wang committed
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
  step[0] = output[0] + output[(8 - 1)];
  step[1] = output[1] + output[(8 - 2)];
  step[2] = output[2] + output[(8 - 3)];
  step[3] = output[3] + output[(8 - 4)];
  step[4] = -output[4] + output[(8 - 5)];
  step[5] = -output[5] + output[(8 - 6)];
  step[6] = -output[6] + output[(8 - 7)];
  step[7] = -output[7] + output[(8 - 8)];
  step[8] = output[8];
  step[9] = output[9];
Yaowu Xu's avatar
Yaowu Xu committed
1165
1166
1167
1168
  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
Yunqing Wang's avatar
Yunqing Wang committed
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
  step[14] = output[14];
  step[15] = output[15];

  step[16] = output[16] + output[23];
  step[17] = output[17] + output[22];
  step[18] = output[18] + output[21];
  step[19] = output[19] + output[20];
  step[20] = -output[20] + output[19];
  step[21] = -output[21] + output[18];
  step[22] = -output[22] + output[17];
  step[23] = -output[23] + output[16];
  step[24] = -output[24] + output[31];
  step[25] = -output[25] + output[30];
  step[26] = -output[26] + output[29];
  step[27] = -output[27] + output[28];
  step[28] = output[28] + output[27];
  step[29] = output[29] + output[26];
  step[30] = output[30] + output[25];
  step[31] = output[31] + output[24];
1188
1189

  // Stage 4
Yunqing Wang's avatar
Yunqing Wang committed
1190
1191
1192
1193
1194
  output[0] = step[0] + step[3];
  output[1] = step[1] + step[2];
  output[2] = -step[2] + step[1];
  output[3] = -step[3] + step[0];
  output[4] = step[4];
Yaowu Xu's avatar
Yaowu Xu committed
1195
1196
  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
Yunqing Wang's avatar
Yunqing Wang committed
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
  output[7] = step[7];
  output[8] = step[8] + step[11];
  output[9] = step[9] + step[10];
  output[10] = -step[10] + step[9];
  output[11] = -step[11] + step[8];
  output[12] = -step[12] + step[15];
  output[13] = -step[13] + step[14];
  output[14] = step[14] + step[13];
  output[15] = step[15] + step[12];

  output[16] = step[16];
  output[17] = step[17];
Yaowu Xu's avatar
Yaowu Xu committed
1209
1210
1211
1212
  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
Yunqing Wang's avatar
Yunqing Wang committed
1213
1214
1215
1216
  output[22] = step[22];
  output[23] = step[23];
  output[24] = step[24];
  output[25] = step[25];
Yaowu Xu's avatar
Yaowu Xu committed
1217
1218
1219
1220
  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
Yunqing Wang's avatar
Yunqing Wang committed
1221
1222
  output[30] = step[30];
  output[31] = step[31];
1223
1224

  // Stage 5