/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <math.h>
#include <string.h>

Yaowu Xu's avatar
Yaowu Xu committed
15
#include "./aom_dsp_rtcd.h"
16
#include "aom_dsp/inv_txfm.h"
17
#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
18
    CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
19
20
#include "av1/common/daala_tx.h"
#endif
21

Yaowu Xu's avatar
Yaowu Xu committed
22
// Inverse 4x4 Walsh-Hadamard transform of all 16 coefficients in `input`.
// The residual is accumulated into the 4x4 pixel block at `dest` (rows are
// `stride` bytes apart) through clip_pixel_add().
void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  tran_low_t rows[16];
  int r, c;

  // Pass 1: transform each row of four coefficients into rows[].
  for (r = 0; r < 4; ++r) {
    const tran_low_t *const src = input + 4 * r;
    tran_low_t *const dst = rows + 4 * r;
    // Load order matters: element 1 feeds wc and element 3 feeds wb.
    tran_high_t wa = src[0] >> UNIT_QUANT_SHIFT;
    tran_high_t wc = src[1] >> UNIT_QUANT_SHIFT;
    tran_high_t wd = src[2] >> UNIT_QUANT_SHIFT;
    tran_high_t wb = src[3] >> UNIT_QUANT_SHIFT;
    tran_high_t half;

    wa = wa + wc;
    wd = wd - wb;
    half = (wa - wd) >> 1;
    wb = half - wb;
    wc = half - wc;
    wa = wa - wb;
    wd = wd + wc;

    dst[0] = WRAPLOW(wa);
    dst[1] = WRAPLOW(wb);
    dst[2] = WRAPLOW(wc);
    dst[3] = WRAPLOW(wd);
  }

  // Pass 2: transform each column of rows[] and add it to the destination.
  for (c = 0; c < 4; ++c) {
    uint8_t *const px = dest + c;
    tran_high_t wa = rows[4 * 0 + c];
    tran_high_t wc = rows[4 * 1 + c];
    tran_high_t wd = rows[4 * 2 + c];
    tran_high_t wb = rows[4 * 3 + c];
    tran_high_t half;

    wa = wa + wc;
    wd = wd - wb;
    half = (wa - wd) >> 1;
    wb = half - wb;
    wc = half - wc;
    wa = wa - wb;
    wd = wd + wc;

    px[stride * 0] = clip_pixel_add(px[stride * 0], WRAPLOW(wa));
    px[stride * 1] = clip_pixel_add(px[stride * 1], WRAPLOW(wb));
    px[stride * 2] = clip_pixel_add(px[stride * 2], WRAPLOW(wc));
    px[stride * 3] = clip_pixel_add(px[stride * 3], WRAPLOW(wd));
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
74
// Inverse 4x4 Walsh-Hadamard transform that reads only in[0]. The result is
// accumulated into the 4x4 pixel block at `dest` (rows are `dest_stride`
// bytes apart) through clip_pixel_add().
void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  tran_low_t first_pass[4];
  tran_low_t rest;
  tran_high_t lead, half;
  int col;

  // Row pass: split the single coefficient into a leading term and the
  // value shared by the remaining three positions.
  lead = in[0] >> UNIT_QUANT_SHIFT;
  half = lead >> 1;
  lead -= half;
  rest = WRAPLOW(half);
  first_pass[0] = WRAPLOW(lead);
  first_pass[1] = rest;
  first_pass[2] = rest;
  first_pass[3] = rest;

  // Column pass: the same split per column, added straight into dest.
  for (col = 0; col < 4; ++col) {
    uint8_t *const px = dest + col;

    half = first_pass[col] >> 1;
    lead = first_pass[col] - half;
    px[dest_stride * 0] = clip_pixel_add(px[dest_stride * 0], lead);
    px[dest_stride * 1] = clip_pixel_add(px[dest_stride * 1], half);
    px[dest_stride * 2] = clip_pixel_add(px[dest_stride * 2], half);
    px[dest_stride * 3] = clip_pixel_add(px[dest_stride * 3], half);
  }
}

Luca Barbato's avatar
Luca Barbato committed
100
// 1-D 4-point inverse DCT: reads input[0..3] and writes output[0..3].
void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t mid[4];
  tran_high_t t0, t1;

  // Stage 1: input[0]/input[2] are combined with cospi_16_64, and
  // input[1]/input[3] with the cospi_8_64 / cospi_24_64 pair.
  t0 = (input[0] + input[2]) * cospi_16_64;
  t1 = (input[0] - input[2]) * cospi_16_64;
  mid[0] = WRAPLOW(dct_const_round_shift(t0));
  mid[1] = WRAPLOW(dct_const_round_shift(t1));

  t0 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  t1 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  mid[2] = WRAPLOW(dct_const_round_shift(t0));
  mid[3] = WRAPLOW(dct_const_round_shift(t1));

  // Stage 2: sums fill the first half of the output, differences the
  // mirrored second half.
  output[0] = WRAPLOW(mid[0] + mid[3]);
  output[3] = WRAPLOW(mid[0] - mid[3]);
  output[1] = WRAPLOW(mid[1] + mid[2]);
  output[2] = WRAPLOW(mid[1] - mid[2]);
}

Yaowu Xu's avatar
Yaowu Xu committed
120
// 2-D 4x4 inverse DCT over all 16 coefficients. aom_idct4_c() is applied to
// every row and then to every column; each result is rounded by 4 bits and
// accumulated into `dest` (rows `stride` bytes apart) via clip_pixel_add().
void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t inter[4 * 4];
  tran_low_t col_in[4], col_out[4];
  int row, col;

  // Row pass.
  for (row = 0; row < 4; ++row) {
    aom_idct4_c(input + 4 * row, inter + 4 * row);
  }

  // Column pass: gather one column, transform it, then add it to dest.
  for (col = 0; col < 4; ++col) {
    for (row = 0; row < 4; ++row) col_in[row] = inter[row * 4 + col];
    aom_idct4_c(col_in, col_out);
    for (row = 0; row < 4; ++row) {
      uint8_t *const px = dest + (row * stride + col);
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 4));
    }
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
144
void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
145
146
147
                         int dest_stride) {
  int i;
  tran_high_t a1;
148
149
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
150
151
  a1 = ROUND_POWER_OF_TWO(out, 4);

152
153
  if (a1 == 0) return;

154
155
156
157
158
159
160
161
162
  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
    dest[2] = clip_pixel_add(dest[2], a1);
    dest[3] = clip_pixel_add(dest[3], a1);
    dest += dest_stride;
  }
}

Luca Barbato's avatar
Luca Barbato committed
163
// 1-D 8-point inverse DCT: reads input[0..7] and writes output[0..7].
// The two scratch arrays alternate as source and destination per stage.
void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t u[8], v[8];
  tran_high_t t0, t1;
  int k;

  // Stage 1: even-indexed inputs are copied (reordered); odd-indexed inputs
  // are combined in pairs with cospi constants.
  u[0] = input[0];
  u[1] = input[2];
  u[2] = input[4];
  u[3] = input[6];

  t0 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  t1 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  u[4] = WRAPLOW(dct_const_round_shift(t0));
  u[7] = WRAPLOW(dct_const_round_shift(t1));

  t0 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  t1 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  u[5] = WRAPLOW(dct_const_round_shift(t0));
  u[6] = WRAPLOW(dct_const_round_shift(t1));

  // Stage 2
  t0 = (u[0] + u[2]) * cospi_16_64;
  t1 = (u[0] - u[2]) * cospi_16_64;
  v[0] = WRAPLOW(dct_const_round_shift(t0));
  v[1] = WRAPLOW(dct_const_round_shift(t1));

  t0 = u[1] * cospi_24_64 - u[3] * cospi_8_64;
  t1 = u[1] * cospi_8_64 + u[3] * cospi_24_64;
  v[2] = WRAPLOW(dct_const_round_shift(t0));
  v[3] = WRAPLOW(dct_const_round_shift(t1));

  v[4] = WRAPLOW(u[4] + u[5]);
  v[5] = WRAPLOW(u[4] - u[5]);
  v[6] = WRAPLOW(-u[6] + u[7]);
  v[7] = WRAPLOW(u[6] + u[7]);

  // Stage 3
  u[0] = WRAPLOW(v[0] + v[3]);
  u[1] = WRAPLOW(v[1] + v[2]);
  u[2] = WRAPLOW(v[1] - v[2]);
  u[3] = WRAPLOW(v[0] - v[3]);

  u[4] = v[4];
  t0 = (v[6] - v[5]) * cospi_16_64;
  t1 = (v[5] + v[6]) * cospi_16_64;
  u[5] = WRAPLOW(dct_const_round_shift(t0));
  u[6] = WRAPLOW(dct_const_round_shift(t1));
  u[7] = v[7];

  // Stage 4: output[k] is the sum of the mirrored pair, output[7 - k] the
  // difference.
  for (k = 0; k < 4; ++k) {
    output[k] = WRAPLOW(u[k] + u[7 - k]);
    output[7 - k] = WRAPLOW(u[k] - u[7 - k]);
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
217
// 2-D 8x8 inverse DCT over all 64 coefficients. aom_idct8_c() is applied to
// every row and then to every column; each result is rounded by 5 bits and
// accumulated into `dest` (rows `stride` bytes apart) via clip_pixel_add().
void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t inter[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int row, col;

  // Row pass.
  for (row = 0; row < 8; ++row) {
    aom_idct8_c(input + 8 * row, inter + 8 * row);
  }

  // Column pass: gather one column, transform it, then add it to dest.
  for (col = 0; col < 8; ++col) {
    for (row = 0; row < 8; ++row) col_in[row] = inter[row * 8 + col];
    aom_idct8_c(col_in, col_out);
    for (row = 0; row < 8; ++row) {
      uint8_t *const px = dest + (row * stride + col);
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 5));
    }
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
241
// 8x8 inverse DCT that reads only input[0]. The coefficient is scaled twice
// by cospi_16_64, rounded by 5 bits, and the resulting constant is added to
// every pixel of the 8x8 block at `dest` via clip_pixel_add().
void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int row, col;
  tran_high_t dc_add;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));
  dc_add = ROUND_POWER_OF_TWO(dc, 5);

  // Nothing to add: leave the destination untouched.
  if (dc_add == 0) return;

  for (row = 0; row < 8; ++row) {
    for (col = 0; col < 8; ++col) {
      dest[col] = clip_pixel_add(dest[col], dc_add);
    }
    dest += stride;
  }
}

Luca Barbato's avatar
Luca Barbato committed
254
// 1-D 4-point inverse ADST: reads input[0..3] and writes output[0..3].
// An all-zero input yields an all-zero output without any arithmetic.
void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t p0, p1, p2, p3, p4, p5, p6;
  tran_high_t folded, sum0, sum1, mid, last;

  const tran_low_t in0 = input[0];
  const tran_low_t in1 = input[1];
  const tran_low_t in2 = input[2];
  const tran_low_t in3 = input[3];

  if ((in0 | in1 | in2 | in3) == 0) {
    output[0] = 0;
    output[1] = 0;
    output[2] = 0;
    output[3] = 0;
    return;
  }

  // Individual sinpi products, each held in a tran_high_t before summing.
  p0 = sinpi_1_9 * in0;
  p1 = sinpi_2_9 * in0;
  p2 = sinpi_3_9 * in1;
  p3 = sinpi_4_9 * in2;
  p4 = sinpi_1_9 * in2;
  p5 = sinpi_2_9 * in3;
  p6 = sinpi_4_9 * in3;
  folded = WRAPLOW(in0 - in2 + in3);

  sum0 = p0 + p3 + p5;
  sum1 = p1 - p4 - p6;
  mid = p2;
  last = sinpi_3_9 * folded;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(sum0 + mid));
  output[1] = WRAPLOW(dct_const_round_shift(sum1 + mid));
  output[2] = WRAPLOW(dct_const_round_shift(last));
  output[3] = WRAPLOW(dct_const_round_shift(sum0 + sum1 - mid));
}

Luca Barbato's avatar
Luca Barbato committed
291
// 1-D 8-point inverse ADST: reads input[0..7] and writes output[0..7].
// An all-zero input yields an all-zero output without any arithmetic.
// The s[] intermediates are deliberately plain `int` (with explicit casts),
// while the x[] values are tran_high_t.
void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
  // x[k] is loaded from input[in_order[k]].
  static const int in_order[8] = { 7, 0, 5, 2, 3, 4, 1, 6 };
  int s[8];
  tran_high_t x[8];
  tran_high_t any_set = 0;
  int k;

  for (k = 0; k < 8; ++k) {
    x[k] = input[in_order[k]];
    any_set |= x[k];
  }

  if (!any_set) {
    for (k = 0; k < 8; ++k) output[k] = 0;
    return;
  }

  // stage 1
  s[0] = (int)(cospi_2_64 * x[0] + cospi_30_64 * x[1]);
  s[1] = (int)(cospi_30_64 * x[0] - cospi_2_64 * x[1]);
  s[2] = (int)(cospi_10_64 * x[2] + cospi_22_64 * x[3]);
  s[3] = (int)(cospi_22_64 * x[2] - cospi_10_64 * x[3]);
  s[4] = (int)(cospi_18_64 * x[4] + cospi_14_64 * x[5]);
  s[5] = (int)(cospi_14_64 * x[4] - cospi_18_64 * x[5]);
  s[6] = (int)(cospi_26_64 * x[6] + cospi_6_64 * x[7]);
  s[7] = (int)(cospi_6_64 * x[6] - cospi_26_64 * x[7]);

  // x[k] takes the sum and x[k + 4] the difference of s[k] and s[k + 4].
  for (k = 0; k < 4; ++k) {
    x[k] = WRAPLOW(dct_const_round_shift(s[k] + s[k + 4]));
    x[k + 4] = WRAPLOW(dct_const_round_shift(s[k] - s[k + 4]));
  }

  // stage 2
  s[0] = (int)x[0];
  s[1] = (int)x[1];
  s[2] = (int)x[2];
  s[3] = (int)x[3];
  s[4] = (int)(cospi_8_64 * x[4] + cospi_24_64 * x[5]);
  s[5] = (int)(cospi_24_64 * x[4] - cospi_8_64 * x[5]);
  s[6] = (int)(-cospi_24_64 * x[6] + cospi_8_64 * x[7]);
  s[7] = (int)(cospi_8_64 * x[6] + cospi_24_64 * x[7]);

  x[0] = WRAPLOW(s[0] + s[2]);
  x[1] = WRAPLOW(s[1] + s[3]);
  x[2] = WRAPLOW(s[0] - s[2]);
  x[3] = WRAPLOW(s[1] - s[3]);
  x[4] = WRAPLOW(dct_const_round_shift(s[4] + s[6]));
  x[5] = WRAPLOW(dct_const_round_shift(s[5] + s[7]));
  x[6] = WRAPLOW(dct_const_round_shift(s[4] - s[6]));
  x[7] = WRAPLOW(dct_const_round_shift(s[5] - s[7]));

  // stage 3
  s[2] = (int)(cospi_16_64 * (x[2] + x[3]));
  s[3] = (int)(cospi_16_64 * (x[2] - x[3]));
  s[6] = (int)(cospi_16_64 * (x[6] + x[7]));
  s[7] = (int)(cospi_16_64 * (x[6] - x[7]));

  x[2] = WRAPLOW(dct_const_round_shift(s[2]));
  x[3] = WRAPLOW(dct_const_round_shift(s[3]));
  x[6] = WRAPLOW(dct_const_round_shift(s[6]));
  x[7] = WRAPLOW(dct_const_round_shift(s[7]));

  // Final permutation; odd output positions are negated.
  output[0] = WRAPLOW(x[0]);
  output[1] = WRAPLOW(-x[4]);
  output[2] = WRAPLOW(x[6]);
  output[3] = WRAPLOW(-x[2]);
  output[4] = WRAPLOW(x[3]);
  output[5] = WRAPLOW(-x[7]);
  output[6] = WRAPLOW(x[5]);
  output[7] = WRAPLOW(-x[1]);
}

Yaowu Xu's avatar
Yaowu Xu committed
368
// 2-D 8x8 inverse DCT for sparse blocks. Only the first 4 input rows are
// transformed (per the original note, only those rows hold non-zero
// coefficients); the remaining intermediate rows stay zero. All 8 columns
// are then transformed, rounded by 5 bits and added to `dest` via
// clip_pixel_add().
void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t inter[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int row, col;

  // Row pass over the first 4 rows only.
  for (row = 0; row < 4; ++row) {
    aom_idct8_c(input + 8 * row, inter + 8 * row);
  }

  // Column pass: gather one column, transform it, then add it to dest.
  for (col = 0; col < 8; ++col) {
    for (row = 0; row < 8; ++row) col_in[row] = inter[row * 8 + col];
    aom_idct8_c(col_in, col_out);
    for (row = 0; row < 8; ++row) {
      uint8_t *const px = dest + (row * stride + col);
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 5));
    }
  }
}

Luca Barbato's avatar
Luca Barbato committed
393
// 1-D 16-point inverse DCT: reads input[0..15] and writes output[0..15].
// The two scratch arrays p[] and q[] alternate as source and destination
// from one stage to the next.
void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
  // Stage-1 load order: p[k] = input[stage1_src[k]].
  static const int stage1_src[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                      1, 9, 5, 13, 3, 11, 7, 15 };
  tran_low_t p[16], q[16];
  tran_high_t t0, t1;
  int k;

  // stage 1
  for (k = 0; k < 16; ++k) p[k] = input[stage1_src[k]];

  // stage 2
  for (k = 0; k < 8; ++k) q[k] = p[k];

  t0 = p[8] * cospi_30_64 - p[15] * cospi_2_64;
  t1 = p[8] * cospi_2_64 + p[15] * cospi_30_64;
  q[8] = WRAPLOW(dct_const_round_shift(t0));
  q[15] = WRAPLOW(dct_const_round_shift(t1));

  t0 = p[9] * cospi_14_64 - p[14] * cospi_18_64;
  t1 = p[9] * cospi_18_64 + p[14] * cospi_14_64;
  q[9] = WRAPLOW(dct_const_round_shift(t0));
  q[14] = WRAPLOW(dct_const_round_shift(t1));

  t0 = p[10] * cospi_22_64 - p[13] * cospi_10_64;
  t1 = p[10] * cospi_10_64 + p[13] * cospi_22_64;
  q[10] = WRAPLOW(dct_const_round_shift(t0));
  q[13] = WRAPLOW(dct_const_round_shift(t1));

  t0 = p[11] * cospi_6_64 - p[12] * cospi_26_64;
  t1 = p[11] * cospi_26_64 + p[12] * cospi_6_64;
  q[11] = WRAPLOW(dct_const_round_shift(t0));
  q[12] = WRAPLOW(dct_const_round_shift(t1));

  // stage 3
  for (k = 0; k < 4; ++k) p[k] = q[k];

  t0 = q[4] * cospi_28_64 - q[7] * cospi_4_64;
  t1 = q[4] * cospi_4_64 + q[7] * cospi_28_64;
  p[4] = WRAPLOW(dct_const_round_shift(t0));
  p[7] = WRAPLOW(dct_const_round_shift(t1));

  t0 = q[5] * cospi_12_64 - q[6] * cospi_20_64;
  t1 = q[5] * cospi_20_64 + q[6] * cospi_12_64;
  p[5] = WRAPLOW(dct_const_round_shift(t0));
  p[6] = WRAPLOW(dct_const_round_shift(t1));

  p[8] = WRAPLOW(q[8] + q[9]);
  p[9] = WRAPLOW(q[8] - q[9]);
  p[10] = WRAPLOW(-q[10] + q[11]);
  p[11] = WRAPLOW(q[10] + q[11]);
  p[12] = WRAPLOW(q[12] + q[13]);
  p[13] = WRAPLOW(q[12] - q[13]);
  p[14] = WRAPLOW(-q[14] + q[15]);
  p[15] = WRAPLOW(q[14] + q[15]);

  // stage 4
  t0 = (p[0] + p[1]) * cospi_16_64;
  t1 = (p[0] - p[1]) * cospi_16_64;
  q[0] = WRAPLOW(dct_const_round_shift(t0));
  q[1] = WRAPLOW(dct_const_round_shift(t1));

  t0 = p[2] * cospi_24_64 - p[3] * cospi_8_64;
  t1 = p[2] * cospi_8_64 + p[3] * cospi_24_64;
  q[2] = WRAPLOW(dct_const_round_shift(t0));
  q[3] = WRAPLOW(dct_const_round_shift(t1));

  q[4] = WRAPLOW(p[4] + p[5]);
  q[5] = WRAPLOW(p[4] - p[5]);
  q[6] = WRAPLOW(-p[6] + p[7]);
  q[7] = WRAPLOW(p[6] + p[7]);

  q[8] = p[8];
  q[15] = p[15];

  t0 = -p[9] * cospi_8_64 + p[14] * cospi_24_64;
  t1 = p[9] * cospi_24_64 + p[14] * cospi_8_64;
  q[9] = WRAPLOW(dct_const_round_shift(t0));
  q[14] = WRAPLOW(dct_const_round_shift(t1));

  t0 = -p[10] * cospi_24_64 - p[13] * cospi_8_64;
  t1 = -p[10] * cospi_8_64 + p[13] * cospi_24_64;
  q[10] = WRAPLOW(dct_const_round_shift(t0));
  q[13] = WRAPLOW(dct_const_round_shift(t1));

  q[11] = p[11];
  q[12] = p[12];

  // stage 5
  p[0] = WRAPLOW(q[0] + q[3]);
  p[1] = WRAPLOW(q[1] + q[2]);
  p[2] = WRAPLOW(q[1] - q[2]);
  p[3] = WRAPLOW(q[0] - q[3]);

  p[4] = q[4];
  t0 = (q[6] - q[5]) * cospi_16_64;
  t1 = (q[5] + q[6]) * cospi_16_64;
  p[5] = WRAPLOW(dct_const_round_shift(t0));
  p[6] = WRAPLOW(dct_const_round_shift(t1));
  p[7] = q[7];

  p[8] = WRAPLOW(q[8] + q[11]);
  p[9] = WRAPLOW(q[9] + q[10]);
  p[10] = WRAPLOW(q[9] - q[10]);
  p[11] = WRAPLOW(q[8] - q[11]);
  p[12] = WRAPLOW(-q[12] + q[15]);
  p[13] = WRAPLOW(-q[13] + q[14]);
  p[14] = WRAPLOW(q[13] + q[14]);
  p[15] = WRAPLOW(q[12] + q[15]);

  // stage 6: the low eight entries are a mirrored sum/difference of p[0..7].
  for (k = 0; k < 4; ++k) {
    q[k] = WRAPLOW(p[k] + p[7 - k]);
    q[7 - k] = WRAPLOW(p[k] - p[7 - k]);
  }

  q[8] = p[8];
  q[9] = p[9];

  t0 = (-p[10] + p[13]) * cospi_16_64;
  t1 = (p[10] + p[13]) * cospi_16_64;
  q[10] = WRAPLOW(dct_const_round_shift(t0));
  q[13] = WRAPLOW(dct_const_round_shift(t1));

  t0 = (-p[11] + p[12]) * cospi_16_64;
  t1 = (p[11] + p[12]) * cospi_16_64;
  q[11] = WRAPLOW(dct_const_round_shift(t0));
  q[12] = WRAPLOW(dct_const_round_shift(t1));

  q[14] = p[14];
  q[15] = p[15];

  // stage 7: output[k] is the sum of the mirrored pair, output[15 - k] the
  // difference.
  for (k = 0; k < 8; ++k) {
    output[k] = WRAPLOW(q[k] + q[15 - k]);
    output[15 - k] = WRAPLOW(q[k] - q[15 - k]);
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
558
// 2-D 16x16 inverse DCT over all 256 coefficients. aom_idct16_c() is applied
// to every row and then to every column; each result is rounded by 6 bits
// and accumulated into `dest` (rows `stride` bytes apart) via
// clip_pixel_add().
void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t inter[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Row pass.
  for (row = 0; row < 16; ++row) {
    aom_idct16_c(input + 16 * row, inter + 16 * row);
  }

  // Column pass: gather one column, transform it, then add it to dest.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = inter[row * 16 + col];
    aom_idct16_c(col_in, col_out);
    for (row = 0; row < 16; ++row) {
      uint8_t *const px = dest + (row * stride + col);
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}

Luca Barbato's avatar
Luca Barbato committed
583
// 1-D 16-point inverse ADST: reads input[0..15] and writes output[0..15].
// An all-zero input yields an all-zero output without any arithmetic.
void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
  // x[k] is loaded from input[in_order[k]].
  static const int in_order[16] = { 15, 0, 13, 2,  11, 4,  9, 6,
                                    7,  8, 5,  10, 3,  12, 1, 14 };
  tran_high_t s[16];
  tran_high_t x[16];
  tran_high_t any_set = 0;
  int k, base;

  for (k = 0; k < 16; ++k) {
    x[k] = input[in_order[k]];
    any_set |= x[k];
  }

  if (!any_set) {
    for (k = 0; k < 16; ++k) output[k] = 0;
    return;
  }

  // stage 1
  s[0] = x[0] * cospi_1_64 + x[1] * cospi_31_64;
  s[1] = x[0] * cospi_31_64 - x[1] * cospi_1_64;
  s[2] = x[2] * cospi_5_64 + x[3] * cospi_27_64;
  s[3] = x[2] * cospi_27_64 - x[3] * cospi_5_64;
  s[4] = x[4] * cospi_9_64 + x[5] * cospi_23_64;
  s[5] = x[4] * cospi_23_64 - x[5] * cospi_9_64;
  s[6] = x[6] * cospi_13_64 + x[7] * cospi_19_64;
  s[7] = x[6] * cospi_19_64 - x[7] * cospi_13_64;
  s[8] = x[8] * cospi_17_64 + x[9] * cospi_15_64;
  s[9] = x[8] * cospi_15_64 - x[9] * cospi_17_64;
  s[10] = x[10] * cospi_21_64 + x[11] * cospi_11_64;
  s[11] = x[10] * cospi_11_64 - x[11] * cospi_21_64;
  s[12] = x[12] * cospi_25_64 + x[13] * cospi_7_64;
  s[13] = x[12] * cospi_7_64 - x[13] * cospi_25_64;
  s[14] = x[14] * cospi_29_64 + x[15] * cospi_3_64;
  s[15] = x[14] * cospi_3_64 - x[15] * cospi_29_64;

  // x[k] takes the sum and x[k + 8] the difference of s[k] and s[k + 8].
  for (k = 0; k < 8; ++k) {
    x[k] = WRAPLOW(dct_const_round_shift(s[k] + s[k + 8]));
    x[k + 8] = WRAPLOW(dct_const_round_shift(s[k] - s[k + 8]));
  }

  // stage 2
  for (k = 0; k < 8; ++k) s[k] = x[k];
  s[8] = x[8] * cospi_4_64 + x[9] * cospi_28_64;
  s[9] = x[8] * cospi_28_64 - x[9] * cospi_4_64;
  s[10] = x[10] * cospi_20_64 + x[11] * cospi_12_64;
  s[11] = x[10] * cospi_12_64 - x[11] * cospi_20_64;
  s[12] = -x[12] * cospi_28_64 + x[13] * cospi_4_64;
  s[13] = x[12] * cospi_4_64 + x[13] * cospi_28_64;
  s[14] = -x[14] * cospi_12_64 + x[15] * cospi_20_64;
  s[15] = x[14] * cospi_20_64 + x[15] * cospi_12_64;

  // The low half is a plain sum/difference; the high half also goes
  // through dct_const_round_shift().
  for (k = 0; k < 4; ++k) {
    x[k] = WRAPLOW(s[k] + s[k + 4]);
    x[k + 4] = WRAPLOW(s[k] - s[k + 4]);
    x[k + 8] = WRAPLOW(dct_const_round_shift(s[k + 8] + s[k + 12]));
    x[k + 12] = WRAPLOW(dct_const_round_shift(s[k + 8] - s[k + 12]));
  }

  // stage 3: entries 0..7 and 8..15 are processed with the same pattern.
  for (base = 0; base < 16; base += 8) {
    s[base + 0] = x[base + 0];
    s[base + 1] = x[base + 1];
    s[base + 2] = x[base + 2];
    s[base + 3] = x[base + 3];
    s[base + 4] = x[base + 4] * cospi_8_64 + x[base + 5] * cospi_24_64;
    s[base + 5] = x[base + 4] * cospi_24_64 - x[base + 5] * cospi_8_64;
    s[base + 6] = -x[base + 6] * cospi_24_64 + x[base + 7] * cospi_8_64;
    s[base + 7] = x[base + 6] * cospi_8_64 + x[base + 7] * cospi_24_64;
  }

  for (base = 0; base < 16; base += 8) {
    x[base + 0] = WRAPLOW(s[base + 0] + s[base + 2]);
    x[base + 1] = WRAPLOW(s[base + 1] + s[base + 3]);
    x[base + 2] = WRAPLOW(s[base + 0] - s[base + 2]);
    x[base + 3] = WRAPLOW(s[base + 1] - s[base + 3]);
    x[base + 4] = WRAPLOW(dct_const_round_shift(s[base + 4] + s[base + 6]));
    x[base + 5] = WRAPLOW(dct_const_round_shift(s[base + 5] + s[base + 7]));
    x[base + 6] = WRAPLOW(dct_const_round_shift(s[base + 4] - s[base + 6]));
    x[base + 7] = WRAPLOW(dct_const_round_shift(s[base + 5] - s[base + 7]));
  }

  // stage 4: all products are formed before any x[] entry is overwritten.
  s[2] = (-cospi_16_64) * (x[2] + x[3]);
  s[3] = cospi_16_64 * (x[2] - x[3]);
  s[6] = cospi_16_64 * (x[6] + x[7]);
  s[7] = cospi_16_64 * (-x[6] + x[7]);
  s[10] = cospi_16_64 * (x[10] + x[11]);
  s[11] = cospi_16_64 * (-x[10] + x[11]);
  s[14] = (-cospi_16_64) * (x[14] + x[15]);
  s[15] = cospi_16_64 * (x[14] - x[15]);

  x[2] = WRAPLOW(dct_const_round_shift(s[2]));
  x[3] = WRAPLOW(dct_const_round_shift(s[3]));
  x[6] = WRAPLOW(dct_const_round_shift(s[6]));
  x[7] = WRAPLOW(dct_const_round_shift(s[7]));
  x[10] = WRAPLOW(dct_const_round_shift(s[10]));
  x[11] = WRAPLOW(dct_const_round_shift(s[11]));
  x[14] = WRAPLOW(dct_const_round_shift(s[14]));
  x[15] = WRAPLOW(dct_const_round_shift(s[15]));

  // Final permutation; outputs 1, 3, 13 and 15 are negated.
  output[0] = WRAPLOW(x[0]);
  output[1] = WRAPLOW(-x[8]);
  output[2] = WRAPLOW(x[12]);
  output[3] = WRAPLOW(-x[4]);
  output[4] = WRAPLOW(x[6]);
  output[5] = WRAPLOW(x[14]);
  output[6] = WRAPLOW(x[10]);
  output[7] = WRAPLOW(x[2]);
  output[8] = WRAPLOW(x[3]);
  output[9] = WRAPLOW(x[11]);
  output[10] = WRAPLOW(x[15]);
  output[11] = WRAPLOW(x[7]);
  output[12] = WRAPLOW(x[5]);
  output[13] = WRAPLOW(-x[13]);
  output[14] = WRAPLOW(x[9]);
  output[15] = WRAPLOW(-x[1]);
}

754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
// 2-D 16x16 inverse DCT for sparse blocks. Only the first 8 input rows are
// transformed (per the original note, all non-zero coefficients lie in the
// upper-left 8x8 area); the remaining intermediate rows stay zero. All 16
// columns are then transformed, rounded by 6 bits and added to `dest` via
// clip_pixel_add().
void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t inter[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Row pass over the first 8 rows only.
  for (row = 0; row < 8; ++row) {
    aom_idct16_c(input + 16 * row, inter + 16 * row);
  }

  // Column pass: gather one column, transform it, then add it to dest.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = inter[row * 16 + col];
    aom_idct16_c(col_in, col_out);
    for (row = 0; row < 16; ++row) {
      uint8_t *const px = dest + (row * stride + col);
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
780
// 2-D 16x16 inverse DCT for very sparse blocks. Only the first 4 input rows
// are transformed (per the original note, all non-zero coefficients lie in
// the upper-left 4x4 area); the remaining intermediate rows stay zero. All
// 16 columns are then transformed, rounded by 6 bits and added to `dest`
// via clip_pixel_add().
void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t inter[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Row pass over the first 4 rows only.
  for (row = 0; row < 4; ++row) {
    aom_idct16_c(input + 16 * row, inter + 16 * row);
  }

  // Column pass: gather one column, transform it, then add it to dest.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = inter[row * 16 + col];
    aom_idct16_c(col_in, col_out);
    for (row = 0; row < 16; ++row) {
      uint8_t *const px = dest + (row * stride + col);
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}

Yaowu Xu's avatar
Yaowu Xu committed
806
// 2-D inverse 16x16 DCT for a block in which only input[0] is used; no
// other coefficient is read. The coefficient is scaled by cospi_16_64 twice
// (with a rounding shift after each multiply), rounded down by 6 bits, and
// the resulting constant is added to all 16x16 pixels at |dest|.
void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);
  // A zero offset leaves every pixel unchanged, so skip the update loop.
  if (a1 == 0) return;
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

Luca Barbato's avatar
Luca Barbato committed
819
// 32-point 1-D inverse DCT.
//
// Reads 32 coefficients from |input| and writes 32 results to |output|.
// The transform is a butterfly network of seven numbered stages plus a final
// stage. It ping-pongs between two scratch arrays: each stage reads only the
// array written by the previous stage, so statements inside a stage are
// independent of one another. Rotation products are accumulated in the
// tran_high_t temporaries and reduced with dct_const_round_shift(); every
// computed value is passed through WRAPLOW() before it is stored.
void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  int i;

  // stage 1
  // Even-indexed coefficients are only permuted here.
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  // Odd-indexed coefficients are rotated in pairs into step1[16..31].
  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  step2[16] = WRAPLOW(step1[16] + step1[17]);
  step2[17] = WRAPLOW(step1[16] - step1[17]);
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
  step2[19] = WRAPLOW(step1[18] + step1[19]);
  step2[20] = WRAPLOW(step1[20] + step1[21]);
  step2[21] = WRAPLOW(step1[20] - step1[21]);
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
  step2[23] = WRAPLOW(step1[22] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[25]);
  step2[25] = WRAPLOW(step1[24] - step1[25]);
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
  step2[27] = WRAPLOW(step1[26] + step1[27]);
  step2[28] = WRAPLOW(step1[28] + step1[29]);
  step2[29] = WRAPLOW(step1[28] - step1[29]);
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
  step2[31] = WRAPLOW(step1[30] + step1[31]);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19]);
  step2[17] = WRAPLOW(step1[17] + step1[18]);
  step2[18] = WRAPLOW(step1[17] - step1[18]);
  step2[19] = WRAPLOW(step1[16] - step1[19]);
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
  step2[22] = WRAPLOW(step1[21] + step1[22]);
  step2[23] = WRAPLOW(step1[20] + step1[23]);

  step2[24] = WRAPLOW(step1[24] + step1[27]);
  step2[25] = WRAPLOW(step1[25] + step1[26]);
  step2[26] = WRAPLOW(step1[25] - step1[26]);
  step2[27] = WRAPLOW(step1[24] - step1[27]);
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
  step2[30] = WRAPLOW(step1[29] + step1[30]);
  step2[31] = WRAPLOW(step1[28] + step1[31]);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  // Mirrored butterfly over elements 0..7: sums go to the low half,
  // differences to the high half.
  for (i = 0; i < 4; ++i) {
    step2[i] = WRAPLOW(step1[i] + step1[7 - i]);
    step2[7 - i] = WRAPLOW(step1[i] - step1[7 - i]);
  }
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // Mirrored butterflies over elements 16..23 and 24..31. Note the sign
  // pattern differs between the two groups.
  for (i = 0; i < 4; ++i) {
    step2[16 + i] = WRAPLOW(step1[16 + i] + step1[23 - i]);
    step2[23 - i] = WRAPLOW(step1[16 + i] - step1[23 - i]);
    step2[24 + i] = WRAPLOW(-step1[24 + i] + step1[31 - i]);
    step2[31 - i] = WRAPLOW(step1[24 + i] + step1[31 - i]);
  }

  // stage 7
  // Mirrored butterfly over elements 0..15.
  for (i = 0; i < 8; ++i) {
    step1[i] = WRAPLOW(step2[i] + step2[15 - i]);
    step1[15 - i] = WRAPLOW(step2[i] - step2[15 - i]);
  }

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  // Pairs (20,27), (21,26), (22,25), (23,24) are each scaled by cospi_16_64.
  for (i = 0; i < 4; ++i) {
    temp1 = (-step2[20 + i] + step2[27 - i]) * cospi_16_64;
    temp2 = (step2[20 + i] + step2[27 - i]) * cospi_16_64;
    step1[20 + i] = WRAPLOW(dct_const_round_shift(temp1));
    step1[27 - i] = WRAPLOW(dct_const_round_shift(temp2));
  }
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  // Mirrored butterfly over all 32 elements; |output| is written only here.
  for (i = 0; i < 16; ++i) {
    output[i] = WRAPLOW(step1[i] + step1[31 - i]);
    output[31 - i] = WRAPLOW(step1[i] - step1[31 - i]);
  }
}

1186
1187
#if CONFIG_MRC_TX
void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
Sarah Parker's avatar
Sarah Parker committed
1188
                              int stride, uint8_t *mask) {
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209