vp9_idct.c 98.9 KB
Newer Older
John Koleszar's avatar
John Koleszar committed
1
/*
2
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
John Koleszar's avatar
John Koleszar committed
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5 6
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
John Koleszar's avatar
John Koleszar committed
9 10
 */

Yunqing Wang's avatar
Yunqing Wang committed
11
#include <math.h>
Dmitry Kovalev's avatar
Dmitry Kovalev committed
12

Yunqing Wang's avatar
Yunqing Wang committed
13
#include "./vp9_rtcd.h"
14 15
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_blockd.h"
16
#include "vp9/common/vp9_idct.h"
17

18 19 20 21 22 23 24 25 26 27 28 29 30
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows  in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
31 32 33 34
// bd of 8 uses trans_low with 16bits, need to remove 16bits
// bd of 10 uses trans_low with 18bits, need to remove 14bits
// bd of 12 uses trans_low with 20bits, need to remove 12bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
35
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
36
#else
37 38
#define WRAPLOW(x, bd) (x)
#endif  // CONFIG_EMULATE_HARDWARE
39 40

#if CONFIG_VP9_HIGHBITDEPTH
41 42
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                             int bd) {
43
  trans = WRAPLOW(trans, bd);
44
  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
45 46 47
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

48
static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
49
  trans = WRAPLOW(trans, 8);
50
  return clip_pixel(WRAPLOW(dest + trans, 8));
51 52
}

53
void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
Timothy B. Terriberry's avatar
Timothy B. Terriberry committed
54 55
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
   0.5 shifts per pixel. */
John Koleszar's avatar
John Koleszar committed
56
  int i;
57 58 59 60
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
John Koleszar's avatar
John Koleszar committed
61 62

  for (i = 0; i < 4; i++) {
Yaowu Xu's avatar
Yaowu Xu committed
63 64 65 66
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
Yaowu Xu's avatar
Yaowu Xu committed
67 68 69 70 71 72 73
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
74 75 76 77
    op[0] = WRAPLOW(a1, 8);
    op[1] = WRAPLOW(b1, 8);
    op[2] = WRAPLOW(c1, 8);
    op[3] = WRAPLOW(d1, 8);
John Koleszar's avatar
John Koleszar committed
78
    ip += 4;
Scott LaVarnway's avatar
Scott LaVarnway committed
79
    op += 4;
John Koleszar's avatar
John Koleszar committed
80 81 82 83
  }

  ip = output;
  for (i = 0; i < 4; i++) {
Timothy B. Terriberry's avatar
Timothy B. Terriberry committed
84 85 86 87
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
Yaowu Xu's avatar
Yaowu Xu committed
88 89 90 91 92 93 94
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
95 96 97 98
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
John Koleszar's avatar
John Koleszar committed
99 100

    ip++;
Scott LaVarnway's avatar
Scott LaVarnway committed
101
    dest++;
John Koleszar's avatar
John Koleszar committed
102 103
  }
}
Hui Su's avatar
Hui Su committed
104

105
void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
John Koleszar's avatar
John Koleszar committed
106
  int i;
107 108 109 110
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
John Koleszar's avatar
John Koleszar committed
111

Yaowu Xu's avatar
Yaowu Xu committed
112
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
Timothy B. Terriberry's avatar
Timothy B. Terriberry committed
113
  e1 = a1 >> 1;
Yaowu Xu's avatar
Yaowu Xu committed
114
  a1 -= e1;
115 116
  op[0] = WRAPLOW(a1, 8);
  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
John Koleszar's avatar
John Koleszar committed
117 118 119

  ip = tmp;
  for (i = 0; i < 4; i++) {
Timothy B. Terriberry's avatar
Timothy B. Terriberry committed
120 121
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
122 123 124 125
    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
John Koleszar's avatar
John Koleszar committed
126
    ip++;
Scott LaVarnway's avatar
Scott LaVarnway committed
127
    dest++;
John Koleszar's avatar
John Koleszar committed
128 129
  }
}
Hui Su's avatar
Hui Su committed
130

131 132 133
static void idct4(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
Yaowu Xu's avatar
Yaowu Xu committed
134 135 136
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
137 138
  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
139 140
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
141 142
  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
John Koleszar's avatar
John Koleszar committed
143

Yaowu Xu's avatar
Yaowu Xu committed
144
  // stage 2
145 146 147 148
  output[0] = WRAPLOW(step[0] + step[3], 8);
  output[1] = WRAPLOW(step[1] + step[2], 8);
  output[2] = WRAPLOW(step[1] - step[2], 8);
  output[3] = WRAPLOW(step[0] - step[3], 8);
149 150
}

151 152 153
void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
Yaowu Xu's avatar
Yaowu Xu committed
154
  int i, j;
155
  tran_low_t temp_in[4], temp_out[4];
Dmitry Kovalev's avatar
Dmitry Kovalev committed
156 157

  // Rows
Yaowu Xu's avatar
Yaowu Xu committed
158
  for (i = 0; i < 4; ++i) {
159
    idct4(input, outptr);
Yaowu Xu's avatar
Yaowu Xu committed
160 161 162
    input += 4;
    outptr += 4;
  }
Dmitry Kovalev's avatar
Dmitry Kovalev committed
163 164

  // Columns
Yaowu Xu's avatar
Yaowu Xu committed
165 166 167
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
168
    idct4(temp_in, temp_out);
169
    for (j = 0; j < 4; ++j) {
170 171
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
172
    }
Yaowu Xu's avatar
Yaowu Xu committed
173 174 175
  }
}

176 177
void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
Yaowu Xu's avatar
Yaowu Xu committed
178
  int i;
179
  tran_high_t a1;
180 181
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
Dmitry Kovalev's avatar
Dmitry Kovalev committed
182
  a1 = ROUND_POWER_OF_TWO(out, 4);
Yaowu Xu's avatar
Yaowu Xu committed
183 184

  for (i = 0; i < 4; i++) {
185 186 187 188
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
    dest[2] = clip_pixel_add(dest[2], a1);
    dest[3] = clip_pixel_add(dest[3], a1);
Scott LaVarnway's avatar
Scott LaVarnway committed
189
    dest += dest_stride;
Yaowu Xu's avatar
Yaowu Xu committed
190 191 192
  }
}

193 194 195
static void idct8(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
Yaowu Xu's avatar
Yaowu Xu committed
196 197 198 199 200 201 202
  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
203 204
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
205 206
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
207 208
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
John Koleszar's avatar
John Koleszar committed
209

Yaowu Xu's avatar
Yaowu Xu committed
210
  // stage 2 & stage 3 - even half
211
  idct4(step1, step1);
212

Yaowu Xu's avatar
Yaowu Xu committed
213
  // stage 2 - odd half
214 215 216 217
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
John Koleszar's avatar
John Koleszar committed
218

Yaowu Xu's avatar
Yaowu Xu committed
219 220 221 222
  // stage 3 -odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
223 224
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
225
  step1[7] = step2[7];
John Koleszar's avatar
John Koleszar committed
226

Yaowu Xu's avatar
Yaowu Xu committed
227
  // stage 4
228 229 230 231 232 233 234 235
  output[0] = WRAPLOW(step1[0] + step1[7], 8);
  output[1] = WRAPLOW(step1[1] + step1[6], 8);
  output[2] = WRAPLOW(step1[2] + step1[5], 8);
  output[3] = WRAPLOW(step1[3] + step1[4], 8);
  output[4] = WRAPLOW(step1[3] - step1[4], 8);
  output[5] = WRAPLOW(step1[2] - step1[5], 8);
  output[6] = WRAPLOW(step1[1] - step1[6], 8);
  output[7] = WRAPLOW(step1[0] - step1[7], 8);
236 237
}

238 239 240
void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
Yaowu Xu's avatar
Yaowu Xu committed
241
  int i, j;
242
  tran_low_t temp_in[8], temp_out[8];
Yunqing Wang's avatar
Yunqing Wang committed
243

Scott LaVarnway's avatar
Scott LaVarnway committed
244
  // First transform rows
Yaowu Xu's avatar
Yaowu Xu committed
245
  for (i = 0; i < 8; ++i) {
246
    idct8(input, outptr);
Yaowu Xu's avatar
Yaowu Xu committed
247 248
    input += 8;
    outptr += 8;
Yunqing Wang's avatar
Yunqing Wang committed
249 250
  }

Scott LaVarnway's avatar
Scott LaVarnway committed
251
  // Then transform columns
Yaowu Xu's avatar
Yaowu Xu committed
252 253 254
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
255
    idct8(temp_in, temp_out);
256
    for (j = 0; j < 8; ++j) {
257 258
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
259
    }
Dmitry Kovalev's avatar
Dmitry Kovalev committed
260
  }
Yunqing Wang's avatar
Yunqing Wang committed
261 262
}

263
void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
264
  int i, j;
265
  tran_high_t a1;
266 267
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
268 269 270
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
271
      dest[i] = clip_pixel_add(dest[i], a1);
272
    dest += stride;
273 274 275
  }
}

276 277
static void iadst4(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
Jingning Han's avatar
Jingning Han committed
278

279 280 281 282
  tran_high_t x0 = input[0];
  tran_high_t x1 = input[1];
  tran_high_t x2 = input[2];
  tran_high_t x3 = input[3];
Jingning Han's avatar
Jingning Han committed
283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311

  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
312 313 314 315
  output[0] = WRAPLOW(dct_const_round_shift(s0), 8);
  output[1] = WRAPLOW(dct_const_round_shift(s1), 8);
  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
  output[3] = WRAPLOW(dct_const_round_shift(s3), 8);
Jingning Han's avatar
Jingning Han committed
316 317
}

318
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
319
                         int tx_type) {
Yunqing Wang's avatar
Yunqing Wang committed
320
  const transform_2d IHT_4[] = {
321 322 323 324
    { idct4, idct4  },  // DCT_DCT  = 0
    { iadst4, idct4  },   // ADST_DCT = 1
    { idct4, iadst4 },    // DCT_ADST = 2
    { iadst4, iadst4 }      // ADST_ADST = 3
Yunqing Wang's avatar
Yunqing Wang committed
325 326
  };

327
  int i, j;
328 329 330
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  tran_low_t temp_in[4], temp_out[4];
Jingning Han's avatar
Jingning Han committed
331 332 333

  // inverse transform row vectors
  for (i = 0; i < 4; ++i) {
Yunqing Wang's avatar
Yunqing Wang committed
334
    IHT_4[tx_type].rows(input, outptr);
Jingning Han's avatar
Jingning Han committed
335 336 337 338 339 340 341 342
    input  += 4;
    outptr += 4;
  }

  // inverse transform column vectors
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
Yunqing Wang's avatar
Yunqing Wang committed
343
    IHT_4[tx_type].cols(temp_in, temp_out);
344
    for (j = 0; j < 4; ++j) {
345 346
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
347
    }
Jingning Han's avatar
Jingning Han committed
348 349
  }
}
350

351
static void iadst8(const tran_low_t *input, tran_low_t *output) {
352 353
  int s0, s1, s2, s3, s4, s5, s6, s7;

354 355 356 357 358 359 360 361
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];
362 363 364

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
Dmitry Kovalev's avatar
Dmitry Kovalev committed
365
              = output[5] = output[6] = output[7] = 0;
366 367 368 369 370 371 372 373 374 375 376 377 378
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

379 380 381 382 383 384 385 386
  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
387 388 389 390 391 392

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
Dmitry Kovalev's avatar
Dmitry Kovalev committed
393 394 395 396
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
397

398 399 400 401 402 403 404 405
  x0 = WRAPLOW(s0 + s2, 8);
  x1 = WRAPLOW(s1 + s3, 8);
  x2 = WRAPLOW(s0 - s2, 8);
  x3 = WRAPLOW(s1 - s3, 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
406 407 408 409 410 411 412

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

413 414 415 416 417 418 419 420 421 422 423 424 425
  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);

  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x4, 8);
  output[2] = WRAPLOW(x6, 8);
  output[3] = WRAPLOW(-x2, 8);
  output[4] = WRAPLOW(x3, 8);
  output[5] = WRAPLOW(-x7, 8);
  output[6] = WRAPLOW(x5, 8);
  output[7] = WRAPLOW(-x1, 8);
426 427
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
428
static const transform_2d IHT_8[] = {
429 430 431 432
  { idct8,  idct8  },  // DCT_DCT  = 0
  { iadst8, idct8  },  // ADST_DCT = 1
  { idct8,  iadst8 },  // DCT_ADST = 2
  { iadst8, iadst8 }   // ADST_ADST = 3
Dmitry Kovalev's avatar
Dmitry Kovalev committed
433 434
};

435
void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
436
                         int tx_type) {
437
  int i, j;
438 439 440
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  tran_low_t temp_in[8], temp_out[8];
Dmitry Kovalev's avatar
Dmitry Kovalev committed
441
  const transform_2d ht = IHT_8[tx_type];
442 443 444

  // inverse transform row vectors
  for (i = 0; i < 8; ++i) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
445
    ht.rows(input, outptr);
446 447 448 449 450 451 452 453
    input += 8;
    outptr += 8;
  }

  // inverse transform column vectors
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
Dmitry Kovalev's avatar
Dmitry Kovalev committed
454
    ht.cols(temp_in, temp_out);
455
    for (j = 0; j < 8; ++j) {
456 457
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
458
    }
459
  }
460 461
}

462 463 464
void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
Yunqing Wang's avatar
Yunqing Wang committed
465
  int i, j;
466
  tran_low_t temp_in[8], temp_out[8];
Yunqing Wang's avatar
Yunqing Wang committed
467

Yaowu Xu's avatar
Yaowu Xu committed
468 469 470
  // First transform rows
  // only first 4 row has non-zero coefs
  for (i = 0; i < 4; ++i) {
471
    idct8(input, outptr);
Yaowu Xu's avatar
Yaowu Xu committed
472 473
    input += 8;
    outptr += 8;
Yunqing Wang's avatar
Yunqing Wang committed
474 475
  }

Yaowu Xu's avatar
Yaowu Xu committed
476 477 478 479
  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
480
    idct8(temp_in, temp_out);
481
    for (j = 0; j < 8; ++j) {
482 483
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
484
    }
Dmitry Kovalev's avatar
Dmitry Kovalev committed
485
  }
Yaowu Xu's avatar
Yaowu Xu committed
486 487
}

488 489 490
static void idct16(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
491

Yaowu Xu's avatar
Yaowu Xu committed
492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521
  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
522 523
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
524 525 526

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
527 528
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
529 530 531

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
532 533
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
534 535 536

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
537 538
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
539 540 541 542 543 544 545 546 547

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
548 549
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
550 551
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
552 553 554 555 556 557 558 559 560 561 562
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
Yaowu Xu's avatar
Yaowu Xu committed
563

564
  // stage 4
Yaowu Xu's avatar
Yaowu Xu committed
565 566
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
567 568
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
569 570
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
571 572 573 574 575 576
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
Yaowu Xu's avatar
Yaowu Xu committed
577 578 579 580 581

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
582 583
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
584 585
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
586 587
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
588 589 590 591
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
592 593 594 595
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
Yaowu Xu's avatar
Yaowu Xu committed
596 597 598
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
599 600
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
601 602
  step1[7] = step2[7];

603 604 605 606 607 608 609 610
  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
Yaowu Xu's avatar
Yaowu Xu committed
611 612

  // stage 6
613 614 615 616 617 618 619 620
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
Yaowu Xu's avatar
Yaowu Xu committed
621 622 623 624
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
625 626
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
627 628
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
629 630
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
Yaowu Xu's avatar
Yaowu Xu committed
631 632 633 634
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
  output[0] = WRAPLOW(step2[0] + step2[15], 8);
  output[1] = WRAPLOW(step2[1] + step2[14], 8);
  output[2] = WRAPLOW(step2[2] + step2[13], 8);
  output[3] = WRAPLOW(step2[3] + step2[12], 8);
  output[4] = WRAPLOW(step2[4] + step2[11], 8);
  output[5] = WRAPLOW(step2[5] + step2[10], 8);
  output[6] = WRAPLOW(step2[6] + step2[9], 8);
  output[7] = WRAPLOW(step2[7] + step2[8], 8);
  output[8] = WRAPLOW(step2[7] - step2[8], 8);
  output[9] = WRAPLOW(step2[6] - step2[9], 8);
  output[10] = WRAPLOW(step2[5] - step2[10], 8);
  output[11] = WRAPLOW(step2[4] - step2[11], 8);
  output[12] = WRAPLOW(step2[3] - step2[12], 8);
  output[13] = WRAPLOW(step2[2] - step2[13], 8);
  output[14] = WRAPLOW(step2[1] - step2[14], 8);
  output[15] = WRAPLOW(step2[0] - step2[15], 8);
651 652
}

653 654 655 656
void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
657
  int i, j;
658
  tran_low_t temp_in[16], temp_out[16];
659

660 661
  // First transform rows
  for (i = 0; i < 16; ++i) {
662
    idct16(input, outptr);
663
    input += 16;
664 665
    outptr += 16;
  }
666

667 668 669 670
  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
671
    idct16(temp_in, temp_out);
672
    for (j = 0; j < 16; ++j) {
673 674
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
675
    }
Dmitry Kovalev's avatar
Dmitry Kovalev committed
676
  }
677
}
Yunqing Wang's avatar
Yunqing Wang committed
678

679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698
static void iadst16(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];
699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742
  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777
  x0 = WRAPLOW(s0 + s4, 8);
  x1 = WRAPLOW(s1 + s5, 8);
  x2 = WRAPLOW(s2 + s6, 8);
  x3 = WRAPLOW(s3 + s7, 8);
  x4 = WRAPLOW(s0 - s4, 8);
  x5 = WRAPLOW(s1 - s5, 8);
  x6 = WRAPLOW(s2 - s6, 8);
  x7 = WRAPLOW(s3 - s7, 8);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

797 798 799 800
  x0 = WRAPLOW(check_range(s0 + s2), 8);
  x1 = WRAPLOW(check_range(s1 + s3), 8);
  x2 = WRAPLOW(check_range(s0 - s2), 8);
  x3 = WRAPLOW(check_range(s1 - s3), 8);
801 802 803 804
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
805 806 807 808
  x8 = WRAPLOW(check_range(s8 + s10), 8);
  x9 = WRAPLOW(check_range(s9 + s11), 8);
  x10 = WRAPLOW(check_range(s8 - s10), 8);
  x11 = WRAPLOW(check_range(s9 - s11), 8);
809 810 811 812
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
813 814 815 816 817 818 819 820 821 822 823

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848
  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s15), 8);

  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x8, 8);
  output[2] = WRAPLOW(x12, 8);
  output[3] = WRAPLOW(-x4, 8);
  output[4] = WRAPLOW(x6, 8);
  output[5] = WRAPLOW(x14, 8);
  output[6] = WRAPLOW(x10, 8);
  output[7] = WRAPLOW(x2, 8);
  output[8] = WRAPLOW(x3, 8);
  output[9] = WRAPLOW(x11, 8);
  output[10] = WRAPLOW(x15, 8);
  output[11] = WRAPLOW(x7, 8);
  output[12] = WRAPLOW(x5, 8);
  output[13] = WRAPLOW(-x13, 8);
  output[14] = WRAPLOW(x9, 8);
  output[15] = WRAPLOW(-x1, 8);
849 850
}

Dmitry Kovalev's avatar
Dmitry Kovalev committed
851
static const transform_2d IHT_16[] = {
852 853 854 855
  { idct16,  idct16  },  // DCT_DCT  = 0
  { iadst16, idct16  },  // ADST_DCT = 1
  { idct16,  iadst16 },  // DCT_ADST = 2
  { iadst16, iadst16 }   // ADST_ADST = 3
Dmitry Kovalev's avatar
Dmitry Kovalev committed
856 857
};

858
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
859
                            int tx_type) {
860
  int i, j;
861 862 863
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  tran_low_t temp_in[16], temp_out[16];
Dmitry Kovalev's avatar
Dmitry Kovalev committed
864
  const transform_2d ht = IHT_16[tx_type];
865

Dmitry Kovalev's avatar
Dmitry Kovalev committed
866
  // Rows
867
  for (i = 0; i < 16; ++i) {
Dmitry Kovalev's avatar
Dmitry Kovalev committed
868
    ht.rows(input, outptr);
869
    input += 16;
870 871 872
    outptr += 16;
  }

Dmitry Kovalev's avatar
Dmitry Kovalev committed
873
  // Columns
874 875 876
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
Dmitry Kovalev's avatar
Dmitry Kovalev committed
877
    ht.cols(temp_in, temp_out);
878
    for (j = 0; j < 16; ++j) {
879 880
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
881
    }
Jingning Han's avatar
Jingning Han committed
882
  }
883 884
}

885 886 887 888
void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
Scott LaVarnway's avatar
Scott LaVarnway committed
889
  int i, j;
890
  tran_low_t temp_in[16], temp_out[16];
891

892 893
  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
Scott LaVarnway's avatar
Scott LaVarnway committed
894
  for (i = 0; i < 4; ++i) {
895
    idct16(input, outptr);
Scott LaVarnway's avatar
Scott LaVarnway committed
896 897 898 899 900 901 902 903
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j*16 + i];
904
    idct16(temp_in, temp_out);
905
    for (j = 0; j < 16; ++j) {
906 907
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
908
    }
Scott LaVarnway's avatar
Scott LaVarnway committed
909 910
  }
}
911

912
void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
913
  int i, j;
914
  tran_high_t a1;
915 916
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
917 918 919
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
920
      dest[i] = clip_pixel_add(dest[i], a1);
921
    dest += stride;
922
  }
923
}
924

925 926 927
static void idct32(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
928

929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948
  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
949 950
  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
951 952 953

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
954 955
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
956 957 958

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
959 960
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
961 962 963

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
964 965
  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
966 967 968

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
969 970
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
971 972 973

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
974 975
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
976 977 978

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
979 980
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
981 982 983

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
984 985
  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
986 987 988 989 990 991 992 993 994 995 996 997 998

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64