dct.c 82.6 KB
Newer Older
Jingning Han's avatar
Jingning Han committed
1
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <math.h>

Yaowu Xu's avatar
Yaowu Xu committed
15
16
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
Geza Lore's avatar
Geza Lore committed
17
#include "./av1_rtcd.h"
18
19
#include "aom_dsp/fwd_txfm.h"
#include "aom_ports/mem.h"
Geza Lore's avatar
Geza Lore committed
20
#include "av1/common/blockd.h"
21
#include "av1/common/av1_fwd_txfm1d.h"
22
#include "av1/common/av1_fwd_txfm1d_cfg.h"
Geza Lore's avatar
Geza Lore committed
23
#include "av1/common/idct.h"
24
#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
25
    CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
26
27
#include "av1/common/daala_tx.h"
#endif
Jingning Han's avatar
Jingning Han committed
28

29
30
// Debug helper meant to assert that each of the `size` entries of `input`
// has a magnitude strictly below (1 << bit).
//
// The check is currently compiled out (`#if 0`), so in practice this function
// only marks its parameters as used and returns.
//
//   input: values to check.
//   size:  number of entries of `input` to inspect.
//   bit:   exclusive bound exponent, i.e. |input[i]| < (1 << bit).
static INLINE void range_check(const tran_low_t *input, const int size,
                               const int bit) {
#if 0  // CONFIG_COEFFICIENT_RANGE_CHECKING
// TODO(angiebird): the range_check is not used because the bit range
// in fdct# is not correct. Since we are going to merge in a new version
// of fdct# from nextgenv2, we won't fix the incorrect bit range now.
  int i;
  for (i = 0; i < size; ++i) {
    assert(abs(input[i]) < (1 << bit));
  }
#else
  // Checking disabled: silence unused-parameter warnings.
  (void)input;
  (void)size;
  (void)bit;
#endif
}

Jingning Han's avatar
Jingning Han committed
46
// 4-point 1-D forward transform (DCT-style butterfly).
//
//   input:  4 source values (read-only).
//   output: 4 resulting coefficients.
//
// `output` doubles as scratch storage between stages, and stage 1 writes
// output[0] before input[0] is read for the last time, so `input` and
// `output` must not refer to the same buffer.
static void fdct4(const tran_low_t *input, tran_low_t *output) {
  tran_high_t temp;
  tran_low_t step[4];

  // stage 0
  range_check(input, 4, 14);

  // stage 1: mirrored sums and differences
  output[0] = input[0] + input[3];
  output[1] = input[1] + input[2];
  output[2] = input[1] - input[2];
  output[3] = input[0] - input[3];

  range_check(output, 4, 15);

  // stage 2: rotations; each product sum is widened in `temp`, then rounded
  temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
  step[0] = (tran_low_t)fdct_round_shift(temp);
  temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
  step[1] = (tran_low_t)fdct_round_shift(temp);
  temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
  step[2] = (tran_low_t)fdct_round_shift(temp);
  temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
  step[3] = (tran_low_t)fdct_round_shift(temp);

  range_check(step, 4, 16);

  // stage 3: reorder into the final output positions
  output[0] = step[0];
  output[1] = step[2];
  output[2] = step[1];
  output[3] = step[3];

  range_check(output, 4, 16);
}

// 8-point 1-D forward transform (DCT-style butterfly), computed in five
// stages that alternate between `output` and the local `step` buffer.
//
//   input:  8 source values (read-only).
//   output: 8 resulting coefficients.
//
// `output` is used as scratch storage from stage 1 onwards, and stage 1
// overwrites output[0] before input[0] is read again, so `input` and
// `output` must not refer to the same buffer.
static void fdct8(const tran_low_t *input, tran_low_t *output) {
  tran_high_t temp;
  tran_low_t step[8];

  // stage 0
  range_check(input, 8, 13);

  // stage 1: mirrored sums and differences
  output[0] = input[0] + input[7];
  output[1] = input[1] + input[6];
  output[2] = input[2] + input[5];
  output[3] = input[3] + input[4];
  output[4] = input[3] - input[4];
  output[5] = input[2] - input[5];
  output[6] = input[1] - input[6];
  output[7] = input[0] - input[7];

  range_check(output, 8, 14);

  // stage 2
  step[0] = output[0] + output[3];
  step[1] = output[1] + output[2];
  step[2] = output[1] - output[2];
  step[3] = output[0] - output[3];
  step[4] = output[4];
  temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
  step[5] = (tran_low_t)fdct_round_shift(temp);
  temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
  step[6] = (tran_low_t)fdct_round_shift(temp);
  step[7] = output[7];

  range_check(step, 8, 15);

  // stage 3
  temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
  output[0] = (tran_low_t)fdct_round_shift(temp);
  temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
  output[1] = (tran_low_t)fdct_round_shift(temp);
  temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
  output[2] = (tran_low_t)fdct_round_shift(temp);
  temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
  output[3] = (tran_low_t)fdct_round_shift(temp);
  output[4] = step[4] + step[5];
  output[5] = step[4] - step[5];
  output[6] = step[7] - step[6];
  output[7] = step[7] + step[6];

  range_check(output, 8, 16);

  // stage 4
  step[0] = output[0];
  step[1] = output[1];
  step[2] = output[2];
  step[3] = output[3];
  temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
  step[4] = (tran_low_t)fdct_round_shift(temp);
  temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
  step[5] = (tran_low_t)fdct_round_shift(temp);
  temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
  step[6] = (tran_low_t)fdct_round_shift(temp);
  temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
  step[7] = (tran_low_t)fdct_round_shift(temp);

  range_check(step, 8, 16);

  // stage 5: reorder into the final output positions
  output[0] = step[0];
  output[1] = step[4];
  output[2] = step[2];
  output[3] = step[6];
  output[4] = step[1];
  output[5] = step[5];
  output[6] = step[3];
  output[7] = step[7];

  range_check(output, 8, 16);
}

160
161
162
// 16-point 1-D forward transform (DCT-style butterfly), computed in seven
// stages that alternate between `output` and the local `step` buffer.
//
//   input:  16 source values (read-only).
//   output: 16 resulting coefficients.
//
// `output` is used as scratch storage from stage 1 onwards, and stage 1
// overwrites output[0] before input[0] is read again, so `input` and
// `output` must not refer to the same buffer.
static void fdct16(const tran_low_t *input, tran_low_t *output) {
  tran_high_t temp;
  tran_low_t step[16];

  // stage 0
  range_check(input, 16, 13);

  // stage 1: mirrored sums and differences
  output[0] = input[0] + input[15];
  output[1] = input[1] + input[14];
  output[2] = input[2] + input[13];
  output[3] = input[3] + input[12];
  output[4] = input[4] + input[11];
  output[5] = input[5] + input[10];
  output[6] = input[6] + input[9];
  output[7] = input[7] + input[8];
  output[8] = input[7] - input[8];
  output[9] = input[6] - input[9];
  output[10] = input[5] - input[10];
  output[11] = input[4] - input[11];
  output[12] = input[3] - input[12];
  output[13] = input[2] - input[13];
  output[14] = input[1] - input[14];
  output[15] = input[0] - input[15];

  range_check(output, 16, 14);

  // stage 2
  step[0] = output[0] + output[7];
  step[1] = output[1] + output[6];
  step[2] = output[2] + output[5];
  step[3] = output[3] + output[4];
  step[4] = output[3] - output[4];
  step[5] = output[2] - output[5];
  step[6] = output[1] - output[6];
  step[7] = output[0] - output[7];
  step[8] = output[8];
  step[9] = output[9];
  temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64;
  step[10] = (tran_low_t)fdct_round_shift(temp);
  temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64;
  step[11] = (tran_low_t)fdct_round_shift(temp);
  temp = output[12] * cospi_16_64 + output[11] * cospi_16_64;
  step[12] = (tran_low_t)fdct_round_shift(temp);
  temp = output[13] * cospi_16_64 + output[10] * cospi_16_64;
  step[13] = (tran_low_t)fdct_round_shift(temp);
  step[14] = output[14];
  step[15] = output[15];

  range_check(step, 16, 15);

  // stage 3
  output[0] = step[0] + step[3];
  output[1] = step[1] + step[2];
  output[2] = step[1] - step[2];
  output[3] = step[0] - step[3];
  output[4] = step[4];
  temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64;
  output[5] = (tran_low_t)fdct_round_shift(temp);
  temp = step[6] * cospi_16_64 + step[5] * cospi_16_64;
  output[6] = (tran_low_t)fdct_round_shift(temp);
  output[7] = step[7];
  output[8] = step[8] + step[11];
  output[9] = step[9] + step[10];
  output[10] = step[9] - step[10];
  output[11] = step[8] - step[11];
  output[12] = step[15] - step[12];
  output[13] = step[14] - step[13];
  output[14] = step[14] + step[13];
  output[15] = step[15] + step[12];

  range_check(output, 16, 16);

  // stage 4
  temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
  step[0] = (tran_low_t)fdct_round_shift(temp);
  temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
  step[1] = (tran_low_t)fdct_round_shift(temp);
  temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
  step[2] = (tran_low_t)fdct_round_shift(temp);
  temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
  step[3] = (tran_low_t)fdct_round_shift(temp);
  step[4] = output[4] + output[5];
  step[5] = output[4] - output[5];
  step[6] = output[7] - output[6];
  step[7] = output[7] + output[6];
  step[8] = output[8];
  temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64;
  step[9] = (tran_low_t)fdct_round_shift(temp);
  temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64;
  step[10] = (tran_low_t)fdct_round_shift(temp);
  step[11] = output[11];
  step[12] = output[12];
  temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64;
  step[13] = (tran_low_t)fdct_round_shift(temp);
  temp = output[14] * cospi_8_64 + output[9] * cospi_24_64;
  step[14] = (tran_low_t)fdct_round_shift(temp);
  step[15] = output[15];

  range_check(step, 16, 16);

  // stage 5
  output[0] = step[0];
  output[1] = step[1];
  output[2] = step[2];
  output[3] = step[3];
  temp = step[4] * cospi_28_64 + step[7] * cospi_4_64;
  output[4] = (tran_low_t)fdct_round_shift(temp);
  temp = step[5] * cospi_12_64 + step[6] * cospi_20_64;
  output[5] = (tran_low_t)fdct_round_shift(temp);
  temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64;
  output[6] = (tran_low_t)fdct_round_shift(temp);
  temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64;
  output[7] = (tran_low_t)fdct_round_shift(temp);
  output[8] = step[8] + step[9];
  output[9] = step[8] - step[9];
  output[10] = step[11] - step[10];
  output[11] = step[11] + step[10];
  output[12] = step[12] + step[13];
  output[13] = step[12] - step[13];
  output[14] = step[15] - step[14];
  output[15] = step[15] + step[14];

  range_check(output, 16, 16);

  // stage 6
  step[0] = output[0];
  step[1] = output[1];
  step[2] = output[2];
  step[3] = output[3];
  step[4] = output[4];
  step[5] = output[5];
  step[6] = output[6];
  step[7] = output[7];
  temp = output[8] * cospi_30_64 + output[15] * cospi_2_64;
  step[8] = (tran_low_t)fdct_round_shift(temp);
  temp = output[9] * cospi_14_64 + output[14] * cospi_18_64;
  step[9] = (tran_low_t)fdct_round_shift(temp);
  temp = output[10] * cospi_22_64 + output[13] * cospi_10_64;
  step[10] = (tran_low_t)fdct_round_shift(temp);
  temp = output[11] * cospi_6_64 + output[12] * cospi_26_64;
  step[11] = (tran_low_t)fdct_round_shift(temp);
  temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64;
  step[12] = (tran_low_t)fdct_round_shift(temp);
  temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64;
  step[13] = (tran_low_t)fdct_round_shift(temp);
  temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64;
  step[14] = (tran_low_t)fdct_round_shift(temp);
  temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64;
  step[15] = (tran_low_t)fdct_round_shift(temp);

  range_check(step, 16, 16);

  // stage 7: reorder into the final output positions
  output[0] = step[0];
  output[1] = step[8];
  output[2] = step[4];
  output[3] = step[12];
  output[4] = step[2];
  output[5] = step[10];
  output[6] = step[6];
  output[7] = step[14];
  output[8] = step[1];
  output[9] = step[9];
  output[10] = step[5];
  output[11] = step[13];
  output[12] = step[3];
  output[13] = step[11];
  output[14] = step[7];
  output[15] = step[15];

  range_check(output, 16, 16);
}

// 32-point 1-D forward transform (DCT-style butterfly), computed in nine
// stages that alternate between `output` and the local `step` buffer.
//
//   input:  32 source values (read-only).
//   output: 32 resulting coefficients.
//
// `output` is used as scratch storage from stage 1 onwards, and stage 1
// overwrites output[0] before input[0] is read again, so `input` and
// `output` must not refer to the same buffer.
static void fdct32(const tran_low_t *input, tran_low_t *output) {
  tran_high_t temp;
  tran_low_t step[32];

  // stage 0
  range_check(input, 32, 14);

  // stage 1: mirrored sums and differences
  output[0] = input[0] + input[31];
  output[1] = input[1] + input[30];
  output[2] = input[2] + input[29];
  output[3] = input[3] + input[28];
  output[4] = input[4] + input[27];
  output[5] = input[5] + input[26];
  output[6] = input[6] + input[25];
  output[7] = input[7] + input[24];
  output[8] = input[8] + input[23];
  output[9] = input[9] + input[22];
  output[10] = input[10] + input[21];
  output[11] = input[11] + input[20];
  output[12] = input[12] + input[19];
  output[13] = input[13] + input[18];
  output[14] = input[14] + input[17];
  output[15] = input[15] + input[16];
  output[16] = input[15] - input[16];
  output[17] = input[14] - input[17];
  output[18] = input[13] - input[18];
  output[19] = input[12] - input[19];
  output[20] = input[11] - input[20];
  output[21] = input[10] - input[21];
  output[22] = input[9] - input[22];
  output[23] = input[8] - input[23];
  output[24] = input[7] - input[24];
  output[25] = input[6] - input[25];
  output[26] = input[5] - input[26];
  output[27] = input[4] - input[27];
  output[28] = input[3] - input[28];
  output[29] = input[2] - input[29];
  output[30] = input[1] - input[30];
  output[31] = input[0] - input[31];

  range_check(output, 32, 15);

  // stage 2
  step[0] = output[0] + output[15];
  step[1] = output[1] + output[14];
  step[2] = output[2] + output[13];
  step[3] = output[3] + output[12];
  step[4] = output[4] + output[11];
  step[5] = output[5] + output[10];
  step[6] = output[6] + output[9];
  step[7] = output[7] + output[8];
  step[8] = output[7] - output[8];
  step[9] = output[6] - output[9];
  step[10] = output[5] - output[10];
  step[11] = output[4] - output[11];
  step[12] = output[3] - output[12];
  step[13] = output[2] - output[13];
  step[14] = output[1] - output[14];
  step[15] = output[0] - output[15];
  step[16] = output[16];
  step[17] = output[17];
  step[18] = output[18];
  step[19] = output[19];
  temp = output[20] * -cospi_16_64 + output[27] * cospi_16_64;
  step[20] = (tran_low_t)fdct_round_shift(temp);
  temp = output[21] * -cospi_16_64 + output[26] * cospi_16_64;
  step[21] = (tran_low_t)fdct_round_shift(temp);
  temp = output[22] * -cospi_16_64 + output[25] * cospi_16_64;
  step[22] = (tran_low_t)fdct_round_shift(temp);
  temp = output[23] * -cospi_16_64 + output[24] * cospi_16_64;
  step[23] = (tran_low_t)fdct_round_shift(temp);
  temp = output[24] * cospi_16_64 + output[23] * cospi_16_64;
  step[24] = (tran_low_t)fdct_round_shift(temp);
  temp = output[25] * cospi_16_64 + output[22] * cospi_16_64;
  step[25] = (tran_low_t)fdct_round_shift(temp);
  temp = output[26] * cospi_16_64 + output[21] * cospi_16_64;
  step[26] = (tran_low_t)fdct_round_shift(temp);
  temp = output[27] * cospi_16_64 + output[20] * cospi_16_64;
  step[27] = (tran_low_t)fdct_round_shift(temp);
  step[28] = output[28];
  step[29] = output[29];
  step[30] = output[30];
  step[31] = output[31];

  range_check(step, 32, 16);

  // stage 3
  output[0] = step[0] + step[7];
  output[1] = step[1] + step[6];
  output[2] = step[2] + step[5];
  output[3] = step[3] + step[4];
  output[4] = step[3] - step[4];
  output[5] = step[2] - step[5];
  output[6] = step[1] - step[6];
  output[7] = step[0] - step[7];
  output[8] = step[8];
  output[9] = step[9];
  temp = step[10] * -cospi_16_64 + step[13] * cospi_16_64;
  output[10] = (tran_low_t)fdct_round_shift(temp);
  temp = step[11] * -cospi_16_64 + step[12] * cospi_16_64;
  output[11] = (tran_low_t)fdct_round_shift(temp);
  temp = step[12] * cospi_16_64 + step[11] * cospi_16_64;
  output[12] = (tran_low_t)fdct_round_shift(temp);
  temp = step[13] * cospi_16_64 + step[10] * cospi_16_64;
  output[13] = (tran_low_t)fdct_round_shift(temp);
  output[14] = step[14];
  output[15] = step[15];
  output[16] = step[16] + step[23];
  output[17] = step[17] + step[22];
  output[18] = step[18] + step[21];
  output[19] = step[19] + step[20];
  output[20] = step[19] - step[20];
  output[21] = step[18] - step[21];
  output[22] = step[17] - step[22];
  output[23] = step[16] - step[23];
  output[24] = step[31] - step[24];
  output[25] = step[30] - step[25];
  output[26] = step[29] - step[26];
  output[27] = step[28] - step[27];
  output[28] = step[28] + step[27];
  output[29] = step[29] + step[26];
  output[30] = step[30] + step[25];
  output[31] = step[31] + step[24];

  range_check(output, 32, 17);

  // stage 4
  step[0] = output[0] + output[3];
  step[1] = output[1] + output[2];
  step[2] = output[1] - output[2];
  step[3] = output[0] - output[3];
  step[4] = output[4];
  temp = output[5] * -cospi_16_64 + output[6] * cospi_16_64;
  step[5] = (tran_low_t)fdct_round_shift(temp);
  temp = output[6] * cospi_16_64 + output[5] * cospi_16_64;
  step[6] = (tran_low_t)fdct_round_shift(temp);
  step[7] = output[7];
  step[8] = output[8] + output[11];
  step[9] = output[9] + output[10];
  step[10] = output[9] - output[10];
  step[11] = output[8] - output[11];
  step[12] = output[15] - output[12];
  step[13] = output[14] - output[13];
  step[14] = output[14] + output[13];
  step[15] = output[15] + output[12];
  step[16] = output[16];
  step[17] = output[17];
  temp = output[18] * -cospi_8_64 + output[29] * cospi_24_64;
  step[18] = (tran_low_t)fdct_round_shift(temp);
  temp = output[19] * -cospi_8_64 + output[28] * cospi_24_64;
  step[19] = (tran_low_t)fdct_round_shift(temp);
  temp = output[20] * -cospi_24_64 + output[27] * -cospi_8_64;
  step[20] = (tran_low_t)fdct_round_shift(temp);
  temp = output[21] * -cospi_24_64 + output[26] * -cospi_8_64;
  step[21] = (tran_low_t)fdct_round_shift(temp);
  step[22] = output[22];
  step[23] = output[23];
  step[24] = output[24];
  step[25] = output[25];
  temp = output[26] * cospi_24_64 + output[21] * -cospi_8_64;
  step[26] = (tran_low_t)fdct_round_shift(temp);
  temp = output[27] * cospi_24_64 + output[20] * -cospi_8_64;
  step[27] = (tran_low_t)fdct_round_shift(temp);
  temp = output[28] * cospi_8_64 + output[19] * cospi_24_64;
  step[28] = (tran_low_t)fdct_round_shift(temp);
  temp = output[29] * cospi_8_64 + output[18] * cospi_24_64;
  step[29] = (tran_low_t)fdct_round_shift(temp);
  step[30] = output[30];
  step[31] = output[31];

  range_check(step, 32, 18);

  // stage 5
  temp = step[0] * cospi_16_64 + step[1] * cospi_16_64;
  output[0] = (tran_low_t)fdct_round_shift(temp);
  temp = step[1] * -cospi_16_64 + step[0] * cospi_16_64;
  output[1] = (tran_low_t)fdct_round_shift(temp);
  temp = step[2] * cospi_24_64 + step[3] * cospi_8_64;
  output[2] = (tran_low_t)fdct_round_shift(temp);
  temp = step[3] * cospi_24_64 + step[2] * -cospi_8_64;
  output[3] = (tran_low_t)fdct_round_shift(temp);
  output[4] = step[4] + step[5];
  output[5] = step[4] - step[5];
  output[6] = step[7] - step[6];
  output[7] = step[7] + step[6];
  output[8] = step[8];
  temp = step[9] * -cospi_8_64 + step[14] * cospi_24_64;
  output[9] = (tran_low_t)fdct_round_shift(temp);
  temp = step[10] * -cospi_24_64 + step[13] * -cospi_8_64;
  output[10] = (tran_low_t)fdct_round_shift(temp);
  output[11] = step[11];
  output[12] = step[12];
  temp = step[13] * cospi_24_64 + step[10] * -cospi_8_64;
  output[13] = (tran_low_t)fdct_round_shift(temp);
  temp = step[14] * cospi_8_64 + step[9] * cospi_24_64;
  output[14] = (tran_low_t)fdct_round_shift(temp);
  output[15] = step[15];
  output[16] = step[16] + step[19];
  output[17] = step[17] + step[18];
  output[18] = step[17] - step[18];
  output[19] = step[16] - step[19];
  output[20] = step[23] - step[20];
  output[21] = step[22] - step[21];
  output[22] = step[22] + step[21];
  output[23] = step[23] + step[20];
  output[24] = step[24] + step[27];
  output[25] = step[25] + step[26];
  output[26] = step[25] - step[26];
  output[27] = step[24] - step[27];
  output[28] = step[31] - step[28];
  output[29] = step[30] - step[29];
  output[30] = step[30] + step[29];
  output[31] = step[31] + step[28];

  range_check(output, 32, 18);

  // stage 6
  step[0] = output[0];
  step[1] = output[1];
  step[2] = output[2];
  step[3] = output[3];
  temp = output[4] * cospi_28_64 + output[7] * cospi_4_64;
  step[4] = (tran_low_t)fdct_round_shift(temp);
  temp = output[5] * cospi_12_64 + output[6] * cospi_20_64;
  step[5] = (tran_low_t)fdct_round_shift(temp);
  temp = output[6] * cospi_12_64 + output[5] * -cospi_20_64;
  step[6] = (tran_low_t)fdct_round_shift(temp);
  temp = output[7] * cospi_28_64 + output[4] * -cospi_4_64;
  step[7] = (tran_low_t)fdct_round_shift(temp);
  step[8] = output[8] + output[9];
  step[9] = output[8] - output[9];
  step[10] = output[11] - output[10];
  step[11] = output[11] + output[10];
  step[12] = output[12] + output[13];
  step[13] = output[12] - output[13];
  step[14] = output[15] - output[14];
  step[15] = output[15] + output[14];
  step[16] = output[16];
  temp = output[17] * -cospi_4_64 + output[30] * cospi_28_64;
  step[17] = (tran_low_t)fdct_round_shift(temp);
  temp = output[18] * -cospi_28_64 + output[29] * -cospi_4_64;
  step[18] = (tran_low_t)fdct_round_shift(temp);
  step[19] = output[19];
  step[20] = output[20];
  temp = output[21] * -cospi_20_64 + output[26] * cospi_12_64;
  step[21] = (tran_low_t)fdct_round_shift(temp);
  temp = output[22] * -cospi_12_64 + output[25] * -cospi_20_64;
  step[22] = (tran_low_t)fdct_round_shift(temp);
  step[23] = output[23];
  step[24] = output[24];
  temp = output[25] * cospi_12_64 + output[22] * -cospi_20_64;
  step[25] = (tran_low_t)fdct_round_shift(temp);
  temp = output[26] * cospi_20_64 + output[21] * cospi_12_64;
  step[26] = (tran_low_t)fdct_round_shift(temp);
  step[27] = output[27];
  step[28] = output[28];
  temp = output[29] * cospi_28_64 + output[18] * -cospi_4_64;
  step[29] = (tran_low_t)fdct_round_shift(temp);
  temp = output[30] * cospi_4_64 + output[17] * cospi_28_64;
  step[30] = (tran_low_t)fdct_round_shift(temp);
  step[31] = output[31];

  range_check(step, 32, 18);

  // stage 7
  output[0] = step[0];
  output[1] = step[1];
  output[2] = step[2];
  output[3] = step[3];
  output[4] = step[4];
  output[5] = step[5];
  output[6] = step[6];
  output[7] = step[7];
  temp = step[8] * cospi_30_64 + step[15] * cospi_2_64;
  output[8] = (tran_low_t)fdct_round_shift(temp);
  temp = step[9] * cospi_14_64 + step[14] * cospi_18_64;
  output[9] = (tran_low_t)fdct_round_shift(temp);
  temp = step[10] * cospi_22_64 + step[13] * cospi_10_64;
  output[10] = (tran_low_t)fdct_round_shift(temp);
  temp = step[11] * cospi_6_64 + step[12] * cospi_26_64;
  output[11] = (tran_low_t)fdct_round_shift(temp);
  temp = step[12] * cospi_6_64 + step[11] * -cospi_26_64;
  output[12] = (tran_low_t)fdct_round_shift(temp);
  temp = step[13] * cospi_22_64 + step[10] * -cospi_10_64;
  output[13] = (tran_low_t)fdct_round_shift(temp);
  temp = step[14] * cospi_14_64 + step[9] * -cospi_18_64;
  output[14] = (tran_low_t)fdct_round_shift(temp);
  temp = step[15] * cospi_30_64 + step[8] * -cospi_2_64;
  output[15] = (tran_low_t)fdct_round_shift(temp);
  output[16] = step[16] + step[17];
  output[17] = step[16] - step[17];
  output[18] = step[19] - step[18];
  output[19] = step[19] + step[18];
  output[20] = step[20] + step[21];
  output[21] = step[20] - step[21];
  output[22] = step[23] - step[22];
  output[23] = step[23] + step[22];
  output[24] = step[24] + step[25];
  output[25] = step[24] - step[25];
  output[26] = step[27] - step[26];
  output[27] = step[27] + step[26];
  output[28] = step[28] + step[29];
  output[29] = step[28] - step[29];
  output[30] = step[31] - step[30];
  output[31] = step[31] + step[30];

  range_check(output, 32, 18);

  // stage 8
  step[0] = output[0];
  step[1] = output[1];
  step[2] = output[2];
  step[3] = output[3];
  step[4] = output[4];
  step[5] = output[5];
  step[6] = output[6];
  step[7] = output[7];
  step[8] = output[8];
  step[9] = output[9];
  step[10] = output[10];
  step[11] = output[11];
  step[12] = output[12];
  step[13] = output[13];
  step[14] = output[14];
  step[15] = output[15];
  temp = output[16] * cospi_31_64 + output[31] * cospi_1_64;
  step[16] = (tran_low_t)fdct_round_shift(temp);
  temp = output[17] * cospi_15_64 + output[30] * cospi_17_64;
  step[17] = (tran_low_t)fdct_round_shift(temp);
  temp = output[18] * cospi_23_64 + output[29] * cospi_9_64;
  step[18] = (tran_low_t)fdct_round_shift(temp);
  temp = output[19] * cospi_7_64 + output[28] * cospi_25_64;
  step[19] = (tran_low_t)fdct_round_shift(temp);
  temp = output[20] * cospi_27_64 + output[27] * cospi_5_64;
  step[20] = (tran_low_t)fdct_round_shift(temp);
  temp = output[21] * cospi_11_64 + output[26] * cospi_21_64;
  step[21] = (tran_low_t)fdct_round_shift(temp);
  temp = output[22] * cospi_19_64 + output[25] * cospi_13_64;
  step[22] = (tran_low_t)fdct_round_shift(temp);
  temp = output[23] * cospi_3_64 + output[24] * cospi_29_64;
  step[23] = (tran_low_t)fdct_round_shift(temp);
  temp = output[24] * cospi_3_64 + output[23] * -cospi_29_64;
  step[24] = (tran_low_t)fdct_round_shift(temp);
  temp = output[25] * cospi_19_64 + output[22] * -cospi_13_64;
  step[25] = (tran_low_t)fdct_round_shift(temp);
  temp = output[26] * cospi_11_64 + output[21] * -cospi_21_64;
  step[26] = (tran_low_t)fdct_round_shift(temp);
  temp = output[27] * cospi_27_64 + output[20] * -cospi_5_64;
  step[27] = (tran_low_t)fdct_round_shift(temp);
  temp = output[28] * cospi_7_64 + output[19] * -cospi_25_64;
  step[28] = (tran_low_t)fdct_round_shift(temp);
  temp = output[29] * cospi_23_64 + output[18] * -cospi_9_64;
  step[29] = (tran_low_t)fdct_round_shift(temp);
  temp = output[30] * cospi_15_64 + output[17] * -cospi_17_64;
  step[30] = (tran_low_t)fdct_round_shift(temp);
  temp = output[31] * cospi_31_64 + output[16] * -cospi_1_64;
  step[31] = (tran_low_t)fdct_round_shift(temp);

  range_check(step, 32, 18);

  // stage 9: reorder into the final output positions
  output[0] = step[0];
  output[1] = step[16];
  output[2] = step[8];
  output[3] = step[24];
  output[4] = step[4];
  output[5] = step[20];
  output[6] = step[12];
  output[7] = step[28];
  output[8] = step[2];
  output[9] = step[18];
  output[10] = step[10];
  output[11] = step[26];
  output[12] = step[6];
  output[13] = step[22];
  output[14] = step[14];
  output[15] = step[30];
  output[16] = step[1];
  output[17] = step[17];
  output[18] = step[9];
  output[19] = step[25];
  output[20] = step[5];
  output[21] = step[21];
  output[22] = step[13];
  output[23] = step[29];
  output[24] = step[3];
  output[25] = step[19];
  output[26] = step[11];
  output[27] = step[27];
  output[28] = step[7];
  output[29] = step[23];
  output[30] = step[15];
  output[31] = step[31];

  range_check(output, 32, 18);
}

732
#ifndef AV1_DCT_GTEST
733
734
735
736
737
738
739
740
741
742
#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64
// 64-point forward transform that delegates to od_bin_fdct64().
//
// The 64 input values are converted to od_coeff, handed to od_bin_fdct64()
// together with the constant 1 as its third argument, and the 64 results are
// converted back to tran_low_t.
// NOTE(review): the meaning of the third argument is not visible here;
// presumably an input stride - confirm against daala_tx.h.
static void fdct64(const tran_low_t *input, tran_low_t *output) {
  od_coeff src[64];
  od_coeff dst[64];
  int k;

  // Widen/narrow into the coefficient type the Daala routine works on.
  for (k = 0; k < 64; ++k) {
    src[k] = (od_coeff)input[k];
  }

  od_bin_fdct64(dst, src, 1);

  for (k = 0; k < 64; ++k) {
    output[k] = (tran_low_t)dst[k];
  }
}
#endif
743

Jingning Han's avatar
Jingning Han committed
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
// 4-point 1-D forward ADST.
//
//   input:  4 source values (read-only).
//   output: 4 resulting coefficients.
//
// All four inputs are read before anything is written. An all-zero input
// yields an all-zero output without doing any arithmetic.
static void fadst4(const tran_low_t *input, tran_low_t *output) {
  const tran_high_t in0 = input[0];
  const tran_high_t in1 = input[1];
  const tran_high_t in2 = input[2];
  const tran_high_t in3 = input[3];
  tran_high_t acc_a, acc_b, acc_c, acc_d;

  // Early out: nothing to transform when every input is zero.
  if ((in0 | in1 | in2 | in3) == 0) {
    output[0] = 0;
    output[1] = 0;
    output[2] = 0;
    output[3] = 0;
    return;
  }

  // Weighted combinations of the inputs (all kept in tran_high_t).
  acc_a = sinpi_1_9 * in0 + sinpi_2_9 * in1 + sinpi_4_9 * in3;
  acc_b = sinpi_3_9 * (in0 + in1 - in3);
  acc_c = sinpi_4_9 * in0 - sinpi_1_9 * in1 + sinpi_2_9 * in3;
  acc_d = sinpi_3_9 * in2;

  // 1-D transform scaling factor is sqrt(2).
  output[0] = (tran_low_t)fdct_round_shift(acc_a + acc_d);
  output[1] = (tran_low_t)fdct_round_shift(acc_b);
  output[2] = (tran_low_t)fdct_round_shift(acc_c - acc_d);
  output[3] = (tran_low_t)fdct_round_shift(acc_c - acc_a + acc_d);
}

// 8-point 1-D forward ADST, computed in three stages.
//
//   input:  8 source values (read-only); all are loaded into locals, in a
//           permuted order, before any output is written.
//   output: 8 resulting coefficients; the odd-indexed outputs are written
//           with their sign negated.
static void fadst8(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // stage 1
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  // x0..x3 are kept unrounded here; they are only rounded after the
  // stage 2 additions below. x4..x7 are rounded immediately.
  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = fdct_round_shift(s0 - s4);
  x5 = fdct_round_shift(s1 - s5);
  x6 = fdct_round_shift(s2 - s6);
  x7 = fdct_round_shift(s3 - s7);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = fdct_round_shift(s0 + s2);
  x1 = fdct_round_shift(s1 + s3);
  x2 = fdct_round_shift(s0 - s2);
  x3 = fdct_round_shift(s1 - s3);
  x4 = fdct_round_shift(s4 + s6);
  x5 = fdct_round_shift(s5 + s7);
  x6 = fdct_round_shift(s4 - s6);
  x7 = fdct_round_shift(s5 - s7);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = fdct_round_shift(s2);
  x3 = fdct_round_shift(s3);
  x6 = fdct_round_shift(s6);
  x7 = fdct_round_shift(s7);

  // Final permutation; odd-indexed outputs are negated.
  output[0] = (tran_low_t)x0;
  output[1] = (tran_low_t)-x4;
  output[2] = (tran_low_t)x6;
  output[3] = (tran_low_t)-x2;
  output[4] = (tran_low_t)x3;
  output[5] = (tran_low_t)-x7;
  output[6] = (tran_low_t)x5;
  output[7] = (tran_low_t)-x1;
}

/*
 * 16-point forward transform (the ADST, per the function name).
 *
 * Reads input[0..15] and writes output[0..15]. All intermediate values are
 * held in tran_high_t. The computation is four butterfly stages; products
 * with the cospi_*_64 constants are brought back to coefficient range with
 * fdct_round_shift().
 *
 * Note on rounding: the "sum" halves produced in stage 1 (x0..x7) and in
 * stage 2 (x0..x3, x8..x11) are carried forward WITHOUT a round-shift; they
 * are only round-shifted at the end of stage 3. The "difference" halves are
 * round-shifted immediately because they are multiplied again in the next
 * stage. Do not "normalize" this without re-deriving the scaling.
 */
static void fadst16(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  // Input permutation: odd x's take the even inputs in ascending order,
  // even x's take the odd inputs in descending order.
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  // Sums stay unrounded here (rounded at the end of stage 3).
  x0 = s0 + s8;
  x1 = s1 + s9;
  x2 = s2 + s10;
  x3 = s3 + s11;
  x4 = s4 + s12;
  x5 = s5 + s13;
  x6 = s6 + s14;
  x7 = s7 + s15;

  x8 = fdct_round_shift(s0 - s8);
  x9 = fdct_round_shift(s1 - s9);
  x10 = fdct_round_shift(s2 - s10);
  x11 = fdct_round_shift(s3 - s11);
  x12 = fdct_round_shift(s4 - s12);
  x13 = fdct_round_shift(s5 - s13);
  x14 = fdct_round_shift(s6 - s14);
  x15 = fdct_round_shift(s7 - s15);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  // Sums stay unrounded here as well.
  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = fdct_round_shift(s0 - s4);
  x5 = fdct_round_shift(s1 - s5);
  x6 = fdct_round_shift(s2 - s6);
  x7 = fdct_round_shift(s3 - s7);

  x8 = s8 + s12;
  x9 = s9 + s13;
  x10 = s10 + s14;
  x11 = s11 + s15;
  x12 = fdct_round_shift(s8 - s12);
  x13 = fdct_round_shift(s9 - s13);
  x14 = fdct_round_shift(s10 - s14);
  x15 = fdct_round_shift(s11 - s15);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  // Every value is round-shifted at this point.
  x0 = fdct_round_shift(s0 + s2);
  x1 = fdct_round_shift(s1 + s3);
  x2 = fdct_round_shift(s0 - s2);
  x3 = fdct_round_shift(s1 - s3);

  x4 = fdct_round_shift(s4 + s6);
  x5 = fdct_round_shift(s5 + s7);
  x6 = fdct_round_shift(s4 - s6);
  x7 = fdct_round_shift(s5 - s7);

  x8 = fdct_round_shift(s8 + s10);
  x9 = fdct_round_shift(s9 + s11);
  x10 = fdct_round_shift(s8 - s10);
  x11 = fdct_round_shift(s9 - s11);

  x12 = fdct_round_shift(s12 + s14);
  x13 = fdct_round_shift(s13 + s15);
  x14 = fdct_round_shift(s12 - s14);
  x15 = fdct_round_shift(s13 - s15);

  // stage 4: only the (2,3), (6,7), (10,11), (14,15) pairs are rotated.
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = fdct_round_shift(s2);
  x3 = fdct_round_shift(s3);
  x6 = fdct_round_shift(s6);
  x7 = fdct_round_shift(s7);
  x10 = fdct_round_shift(s10);
  x11 = fdct_round_shift(s11);
  x14 = fdct_round_shift(s14);
  x15 = fdct_round_shift(s15);

  // Output permutation with sign flips on outputs 1, 3, 13 and 15.
  output[0] = (tran_low_t)x0;
  output[1] = (tran_low_t)-x8;
  output[2] = (tran_low_t)x12;
  output[3] = (tran_low_t)-x4;
  output[4] = (tran_low_t)x6;
  output[5] = (tran_low_t)x14;
  output[6] = (tran_low_t)x10;
  output[7] = (tran_low_t)x2;
  output[8] = (tran_low_t)x3;
  output[9] = (tran_low_t)x11;
  output[10] = (tran_low_t)x15;
  output[11] = (tran_low_t)x7;
  output[12] = (tran_low_t)x5;
  output[13] = (tran_low_t)-x13;
  output[14] = (tran_low_t)x9;
  output[15] = (tran_low_t)-x1;
}

1023
// For use in lieu of ADST
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
#if CONFIG_DAALA_DCT32
/*
 * 32-point "half right" transform, Daala build (CONFIG_DAALA_DCT32).
 *
 * The first 16 inputs are copied unchanged into output[16..31]; the last 16
 * inputs are run through daala_fdct16() into output[0..15].
 */
static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
  int i;
  tran_low_t inputhalf[16];
  // No scaling within; Daala transforms are all orthonormal
  for (i = 0; i < 16; ++i) {
    output[16 + i] = input[i];
  }
  // Stage the upper half in a local buffer before transforming it.
  for (i = 0; i < 16; ++i) {
    inputhalf[i] = input[i + 16];
  }
  daala_fdct16(inputhalf, output);
}
#else
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
/*
 * 32-point "half right" transform, non-Daala build.
 *
 * output[16..31] receives the first 16 inputs multiplied by 4.
 * output[0..15] receives fdct16() of the last 16 inputs, each pre-multiplied
 * by sqrt(2) (Sqrt2 product followed by fdct_round_shift()).
 * Overall scaling factor is 4 times orthogonal.
 */
static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
  tran_low_t scaled_upper[16];
  int k;

  // Pass-through half: plain x4 scaling, no transform.
  for (k = 0; k < 16; ++k) output[k + 16] = input[k] * 4;

  // Transformed half: multiply by sqrt(2) first, then apply the 16-point DCT.
  for (k = 0; k < 16; ++k)
    scaled_upper[k] = (tran_low_t)fdct_round_shift(input[k + 16] * Sqrt2);

  fdct16(scaled_upper, output);
}
1051
#endif
1052

Sarah Parker's avatar
Sarah Parker committed
1053
#if CONFIG_MRC_TX
1054
1055
/*
 * Builds the masked 32x32 residual used by the MRC transform.
 *
 * A 32x32 mask is obtained either from the residual itself
 * (get_mrc_diff_mask, written to txfm_param->mask) when the mask is signaled
 * for this block type, or from the prediction (get_mrc_pred_mask, written to
 * a local scratch buffer) otherwise.
 *
 * If the mask is rejected by is_valid_mrc_mask(), *txfm_param->valid_mask is
 * set to 0 and *input / *input_stride are left untouched.
 *
 * Otherwise every residual sample whose mask entry is zero is replaced by the
 * average of the masked residual samples, the result is written to
 * masked_input (stride 32), *input and *input_stride are redirected to that
 * buffer, and *txfm_param->valid_mask is set to 1.
 *
 * input         in/out: pointer to the residual block pointer.
 * input_stride  in/out: stride of *input.
 * pred          prediction block (only read on the non-signaled path).
 * pred_stride   stride of pred.
 * masked_input  caller-provided buffer; 32 * 32 entries are written.
 * txfm_param    supplies is_inter, mask and valid_mask.
 */
static void get_masked_residual32(const int16_t **input, int *input_stride,
                                  const uint8_t *pred, int pred_stride,
                                  int16_t *masked_input,
                                  TxfmParam *txfm_param) {
  int n_masked_vals = 0;
  uint8_t *mrc_mask;
  uint8_t mask_tmp[32 * 32];
  if ((txfm_param->is_inter && SIGNAL_MRC_MASK_INTER) ||
      (!txfm_param->is_inter && SIGNAL_MRC_MASK_INTRA)) {
    mrc_mask = txfm_param->mask;
    n_masked_vals = get_mrc_diff_mask(*input, *input_stride, mrc_mask, 32, 32,
                                      32, txfm_param->is_inter);
  } else {
    mrc_mask = mask_tmp;
    n_masked_vals = get_mrc_pred_mask(pred, pred_stride, mrc_mask, 32, 32, 32,
                                      txfm_param->is_inter);
  }

  // Do not use MRC_DCT if mask is invalid. DCT_DCT will be used instead.
  if (!is_valid_mrc_mask(n_masked_vals, 32, 32)) {
    *txfm_param->valid_mask = 0;
    return;
  }

  int32_t sum = 0;
  int16_t avg;
  // Accumulate the masked residual samples. Each sample is weighted by its
  // mask entry (presumably 0 or 1 -- TODO confirm against the mask builders).
  for (int i = 0; i < 32; ++i) {
    for (int j = 0; j < 32; ++j) {
      sum += mrc_mask[i * 32 + j] * (*input)[i * (*input_stride) + j];
    }
  }
  // NOTE(review): relies on is_valid_mrc_mask() rejecting n_masked_vals == 0;
  // confirm, otherwise this divides by zero.
  avg = sum / n_masked_vals;

  // Replace all of the unmasked residual samples with the average of the
  // masked ones.
  for (int i = 0; i < 32; ++i) {
    for (int j = 0; j < 32; ++j) {
      masked_input[i * 32 + j] =
          (mrc_mask[i * 32 + j]) ? (*input)[i * (*input_stride) + j] : avg;
    }
  }
  *input = masked_input;
  *input_stride = 32;
  *txfm_param->valid_mask = 1;
}
#endif  // CONFIG_MRC_TX

Lester Lu's avatar
Lester Lu committed
1099
1100
1101
#if CONFIG_LGT
/*
 * 4-point forward LGT: multiplies the 4x4 row-major matrix lgtmtx by the
 * input vector and round-shifts each result.
 *
 *   output[j] = fdct_round_shift(sum over i = 0..3 of lgtmtx[j * 4 + i] * input[i])
 *
 * lgtmtx must not be NULL (checked with assert in debug builds).
 */
static void flgt4(const tran_low_t *input, tran_low_t *output,
                  const tran_high_t *lgtmtx) {
  assert(lgtmtx);

  // Accumulate into a local array first so that output may alias input.
  tran_high_t s[4] = { 0 };
  for (int j = 0; j < 4; ++j) {
    const tran_high_t *row = lgtmtx + j * 4;
    for (int i = 0; i < 4; ++i) s[j] += row[i] * input[i];
  }

  for (int j = 0; j < 4; ++j) output[j] = (tran_low_t)fdct_round_shift(s[j]);
}

/*
 * 8-point forward LGT: multiplies the 8x8 row-major matrix lgtmtx by the
 * input vector and round-shifts each result.
 *
 *   output[j] = fdct_round_shift(sum over i = 0..7 of lgtmtx[j * 8 + i] * input[i])
 *
 * lgtmtx must not be NULL (checked with assert in debug builds).
 */
static void flgt8(const tran_low_t *input, tran_low_t *output,
                  const tran_high_t *lgtmtx) {
  assert(lgtmtx);

  // Accumulate into a local array first so that output may alias input.
  tran_high_t s[8] = { 0 };
  for (int j = 0; j < 8; ++j) {
    const tran_high_t *row = lgtmtx + j * 8;
    for (int i = 0; i < 8; ++i) s[j] += row[i] * input[i];
  }

  for (int j = 0; j < 8; ++j) output[j] = (tran_low_t)fdct_round_shift(s[j]);
}
#endif  // CONFIG_LGT

1125
#if CONFIG_EXT_TX
1126
1127
1128
1129
// TODO(sarahparker) these functions will be removed once the highbitdepth
// codepath works properly for rectangular transforms. They have almost
// identical versions in av1_fwd_txfm1d.c, but those are currently only
// being used for square transforms.
Debargha Mukherjee's avatar
Debargha Mukherjee committed
1130
1131
/*
 * 4-point identity "transform": copies input[0..3] to output[0..3].
 * With CONFIG_DAALA_DCT4 the copy is unscaled; otherwise each sample is
 * multiplied by Sqrt2 and passed through fdct_round_shift().
 */
static void fidtx4(const tran_low_t *input, tran_low_t *output) {
  int i;
  for (i = 0; i < 4; ++i) {
#if CONFIG_DAALA_DCT4
    output[i] = input[i];
#else
    output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2);
#endif
  }
}

/*
 * 8-point identity "transform": copies input[0..7] to output[0..7].
 * With CONFIG_DAALA_DCT8 the copy is unscaled; otherwise each sample is
 * multiplied by 2.
 */
static void fidtx8(const tran_low_t *input, tran_low_t *output) {
  int i;
  for (i = 0; i < 8; ++i) {
#if CONFIG_DAALA_DCT8
    output[i] = input[i];
#else
    output[i] = input[i] * 2;
#endif
  }
}

/*
 * 16-point identity "transform": copies input[0..15] to output[0..15].
 * With CONFIG_DAALA_DCT16 the copy is unscaled; otherwise each sample is
 * multiplied by 2 * Sqrt2 and passed through fdct_round_shift().
 */
static void fidtx16(const tran_low_t *input, tran_low_t *output) {
  int i;
  for (i = 0; i < 16; ++i) {
#if CONFIG_DAALA_DCT16
    output[i] = input[i];
#else
    output[i] = (tran_low_t)fdct_round_shift(input[i] * 2 * Sqrt2);
#endif
  }
}

/*
 * 32-point identity "transform": copies input[0..31] to output[0..31].
 * With CONFIG_DAALA_DCT32 the copy is unscaled; otherwise each sample is
 * multiplied by 4.
 */
static void fidtx32(const tran_low_t *input, tran_low_t *output) {
  int i;
  for (i = 0; i < 32; ++i) {
#if CONFIG_DAALA_DCT32
    output[i] = input[i];
#else
    output[i] = input[i] * 4;
#endif
  }
}

1174
/*
 * Copies an l-row by w-column block of int16_t samples.
 *
 * src / src_stride    source block and its row pitch, in samples.
 * l, w                number of rows and columns to copy.
 * dest / dest_stride  destination block and its row pitch, in samples.
 *
 * Each row is copied with memcpy, so source and destination rows must not
 * overlap.
 */
static void copy_block(const int16_t *src, int src_stride, int l, int w,
                       int16_t *dest, int dest_stride) {
  const size_t row_bytes = (size_t)w * sizeof(*dest);
  int i;
  for (i = 0; i < l; ++i) {
    memcpy(dest + dest_stride * i, src + src_stride * i, row_bytes);
  }
}

1182
/*
 * Mirrors an l-row by w-column block left-to-right, in place.
 * stride is the row pitch of dest in samples. For odd w the middle column
 * stays where it is.
 */
static void fliplr(int16_t *dest, int stride, int l, int w) {
  int i, j;
  for (i = 0; i < l; ++i) {
    int16_t *const row = dest + i * stride;
    for (j = 0; j < w / 2; ++j) {
      const int16_t tmp = row[j];
      row[j] = row[w - 1 - j];
      row[w - 1 - j] = tmp;
    }
  }
}

1193
/*
 * Mirrors an l-row by w-column block top-to-bottom, in place.
 * stride is the row pitch of dest in samples. For odd l the middle row
 * stays where it is.
 */
static void flipud(int16_t *dest, int stride, int l, int w) {
  int i, j;
  // Swap row i with row (l - 1 - i), walking each row in memory order.
  for (i = 0; i < l / 2; ++i) {
    int16_t *const upper = dest + i * stride;
    int16_t *const lower = dest + (l - 1 - i) * stride;
    for (j = 0; j < w; ++j) {
      const int16_t tmp = upper[j];
      upper[j] = lower[j];
      lower[j] = tmp;
    }
  }
}

1204
/*
 * Rotates an l-row by w-column block by 180 degrees in place, i.e. mirrors
 * it both left-to-right and top-to-bottom. stride is the row pitch of dest
 * in samples.
 *
 * Rows are processed in pairs (i, l - 1 - i). When l is odd the middle row
 * has no partner, so it is mirrored left-to-right on its own; without that
 * step the middle row would be left unrotated.
 */
static void fliplrud(int16_t *dest, int stride, int l, int w) {
  int i, j;
  for (i = 0; i < l / 2; ++i) {
    int16_t *const upper = dest + i * stride;
    int16_t *const lower = dest + (l - 1 - i) * stride;
    for (j = 0; j < w; ++j) {
      const int16_t tmp = upper[j];
      upper[j] = lower[w - 1 - j];
      lower[w - 1 - j] = tmp;
    }
  }
  if (l > 0 && (l % 2) != 0) {
    // Unpaired middle row: reverse it in place.
    int16_t *const mid = dest + (l / 2) * stride;
    for (j = 0; j < w / 2; ++j) {
      const int16_t tmp = mid[j];
      mid[j] = mid[w - 1 - j];
      mid[w - 1 - j] = tmp;
    }
  }
}

1215
/*
 * Copies an l x w block from src to dest (copy_block) and then mirrors the
 * copy left-to-right in place (fliplr). src is not modified.
 */
static void copy_fliplr(const int16_t *src, int src_stride, int l, int w,
                        int16_t *dest, int dest_stride) {
  copy_block(src, src_stride, l, w, dest, dest_stride);
  fliplr(dest, dest_stride, l, w);
}

1221
/*
 * Copies an l x w block from src to dest (copy_block) and then mirrors the
 * copy top-to-bottom in place (flipud). src is not modified.
 */
static void copy_flipud(const int16_t *src, int src_stride, int l, int w,
                        int16_t *dest, int dest_stride) {
  copy_block(src, src_stride, l, w, dest, dest_stride);
  flipud(dest, dest_stride, l, w);
}

1227
/*
 * Copies an l x w block from src to dest (copy_block) and then flips the
 * copy in both directions in place (fliplrud). src is not modified.
 */
static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w,
                          int16_t *dest, int dest_stride) {
  copy_block(src, src_stride, l, w, dest, dest_stride);
  fliplrud(dest, dest_stride, l, w);
}

1233
static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w,
1234
1235
                             int16_t *buff, int tx_type) {
  switch (tx_type) {
Sarah Parker's avatar
Sarah Parker committed
1236
1237
1238
#if CONFIG_MRC_TX
    case MRC_DCT:
#endif  // CONFIG_MRC_TX
1239
1240
1241
1242
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
Debargha Mukherjee's avatar
Debargha Mukherjee committed
1243
    case IDTX:
Jingning Han's avatar
Jingning Han committed
1244
    case V_DCT:
1245
1246
    case H_DCT:
    case V_ADST:
1247
    case H_ADST: break;
1248
1249
    case FLIPADST_DCT:
    case FLIPADST_ADST:
1250
    case V_FLIPADST:
1251
      copy_flipud(*src, *src_stride, l, w, buff, w);
1252
      *src = buff;
1253
      *src_stride = w;
1254
1255
1256
      break;
    case DCT_FLIPADST:
    case ADST_FLIPADST:
1257
    case H_FLIPADST:
1258
      copy_fliplr(*src, *src_stride, l, w, buff, w);
1259
      *src = buff;
1260
      *src_stride = w;
1261
1262
      break;
    case FLIPADST_FLIPADST:
1263
      copy_fliplrud(*src, *src_stride, l, w, buff, w);
1264
      *src = buff;
1265
      *src_stride = w;
1266
      break;
1267
    default: assert(0); break;
1268
1269
1270
1271
  }
}
#endif  // CONFIG_EXT_TX

Yaowu Xu's avatar
Yaowu Xu committed
1272
void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
1273
1274
                  TxfmParam *txfm_param) {
  int tx_type = txfm_param->tx_type;
Sarah Parker's avatar
Sarah Parker committed
1275
1276
1277
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif
1278
1279
1280
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
1281
#if !CONFIG_DAALA_DCT4
Jingning Han's avatar
Jingning Han committed
1282
  if (tx_type == DCT_DCT) {
Yaowu Xu's avatar
Yaowu Xu committed
1283
    aom_fdct4x4_c(input, output, stride);
1284
1285
1286
1287
    return;
  }
#endif
  {
Geza Lore's avatar
Geza Lore committed
1288
    static const transform_2d FHT[] = {
1289
1290
#if CONFIG_DAALA_DCT4
      { daala_fdct4, daala_fdct4 },  // DCT_DCT
1291
1292
1293
      { daala_fdst4, daala_fdct4 },  // ADST_DCT
      { daala_fdct4, daala_fdst4 },  // DCT_ADST
      { daala_fdst4, daala_fdst4 },  // ADST_ADST
1294
#if CONFIG_EXT_TX
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
      { daala_fdst4, daala_fdct4 },  // FLIPADST_DCT
      { daala_fdct4, daala_fdst4 },  // DCT_FLIPADST
      { daala_fdst4, daala_fdst4 },  // FLIPADST_FLIPADST
      { daala_fdst4, daala_fdst4 },  // ADST_FLIPADST
      { daala_fdst4, daala_fdst4 },  // FLIPADST_ADST
      { fidtx4, fidtx4 },            // IDTX
      { daala_fdct4, fidtx4 },       // V_DCT
      { fidtx4, daala_fdct4 },       // H_DCT
      { daala_fdst4, fidtx4 },       // V_ADST
      { fidtx4, daala_fdst4 },       // H_ADST
      { daala_fdst4, fidtx4 },       // V_FLIPADST
      { fidtx4, daala_fdst4 },       // H_FLIPADST
1307
1308
#endif
#else
Geza Lore's avatar
Geza Lore committed
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
      { fdct4, fdct4 },    // DCT_DCT
      { fadst4, fdct4 },   // ADST_DCT
      { fdct4, fadst4 },   // DCT_ADST
      { fadst4, fadst4 },  // ADST_ADST
#if CONFIG_EXT_TX
      { fadst4, fdct4 },   // FLIPADST_DCT
      { fdct4, fadst4 },   // DCT_FLIPADST
      { fadst4, fadst4 },  // FLIPADST_FLIPADST
      { fadst4, fadst4 },  // ADST_FLIPADST
      { fadst4, fadst4 },  // FLIPADST_ADST
      { fidtx4, fidtx4 },  // IDTX
      { fdct4, fidtx4 },   // V_DCT
      { fidtx4, fdct4 },   // H_DCT
      { fadst4, fidtx4 },  // V_ADST
      { fidtx4, fadst4 },  // H_ADST
      { fadst4, fidtx4 },  // V_FLIPADST
      { fidtx4, fadst4 },  // H_FLIPADST
1326
#endif
Lester Lu's avatar
Lester Lu committed
1327
#endif
Geza Lore's avatar
Geza Lore committed
1328
1329
    };
    const transform_2d ht = FHT[tx_type];
Jingning Han's avatar
Jingning Han committed
1330
1331
1332
1333
    tran_low_t out[4 * 4];
    int i, j;
    tran_low_t temp_in[4], temp_out[4];

1334
1335
#if CONFIG_EXT_TX
    int16_t flipped_input[4 * 4];
1336
    maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type);
1337
1338
#endif

Lester Lu's avatar
Lester Lu committed
1339
1340
1341
#if CONFIG_LGT
    // Choose LGT adaptive to the prediction. We may apply different LGTs for
    // different rows/columns, indicated by the pointers to 2D arrays
Lester Lu's avatar
Lester Lu committed
1342
1343
1344
1345
    const tran_high_t *lgtmtx_col[1];
    const tran_high_t *lgtmtx_row[1];
    int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
    int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
Lester Lu's avatar
Lester Lu committed
1346
1347
#endif

Jingning Han's avatar
Jingning Han committed
1348
1349
    // Columns
    for (i = 0; i < 4; ++i) {
1350
      /* A C99-safe upshift by 4 for both Daala and VPx TX. */
1351
      for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
1352
#if !CONFIG_DAALA_DCT4
1353
      if (i == 0 && temp_in[0]) temp_in[0] += 1;
1354
#endif
Lester Lu's avatar
Lester Lu committed
1355
1356
#if CONFIG_LGT
      if (use_lgt_col)
Lester Lu's avatar
Lester Lu committed
1357
        flgt4(temp_in, temp_out, lgtmtx_col[0]);
Lester Lu's avatar
Lester Lu committed
1358
1359
1360
      else
#endif
        ht.cols(temp_in, temp_out);
1361
      for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
Jingning Han's avatar
Jingning Han committed
1362
1363
1364
1365
    }

    // Rows
    for (i = 0; i < 4; ++i) {
1366
      for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
Lester Lu's avatar
Lester Lu committed
1367
1368
#if CONFIG_LGT
      if (use_lgt_row)
Lester Lu's avatar
Lester Lu committed
1369
        flgt4(temp_in, temp_out, lgtmtx_row[0]);
Lester Lu's avatar
Lester Lu committed
1370
1371
1372
      else
#endif
        ht.rows(temp_in, temp_out);
1373
1374
1375
1376
1377
#if CONFIG_DAALA_DCT4
      /* Daala TX has orthonormal scaling; shift down by only 1 to achieve
         the usual VPx coefficient left-shift of 3. */
      for (j = 0; j < 4; ++j) output[j + i * 4] = temp_out[j] >> 1;
#else
1378
      for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
1379
#endif
Jingning Han's avatar
Jingning Han committed
1380
1381
1382
1383
    }
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
1384
void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
1385
1386
                  TxfmParam *txfm_param) {
  int tx_type = txfm_param->tx_type;
Sarah Parker's avatar
Sarah Parker committed
1387
1388
1389
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
1390
1391
1392
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
Geza Lore's avatar
Geza Lore committed
1393
1394
1395
1396
1397
  static const transform_2d FHT[] = {
    { fdct8, fdct4 },    // DCT_DCT
    { fadst8, fdct4 },   // ADST_DCT
    { fdct8, fadst4 },   // DCT_ADST
    { fadst8, fadst4 },  // ADST_ADST
1398
#if CONFIG_EXT_TX
Geza Lore's avatar
Geza Lore committed
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
    { fadst8, fdct4 },   // FLIPADST_DCT
    { fdct8, fadst4 },   // DCT_FLIPADST
    { fadst8, fadst4 },  // FLIPADST_FLIPADST
    { fadst8, fadst4 },  // ADST_FLIPADST
    { fadst8, fadst4 },  // FLIPADST_ADST
    { fidtx8, fidtx4 },  // IDTX
    { fdct8, fidtx4 },   // V_DCT
    { fidtx8, fdct4 },   // H_DCT
    { fadst8, fidtx4 },  // V_ADST
    { fidtx8, fadst4 },  // H_ADST
    { fadst8, fidtx4 },  // V_FLIPADST
    { fidtx8, fadst4 },  // H_FLIPADST
1411
#endif
Geza Lore's avatar
Geza Lore committed
1412
1413
  };
  const transform_2d ht = FHT[tx_type];