txfm_common_sse2.h 14.2 KB
Newer Older
Jingning Han's avatar
Jingning Han committed
1
/*
Yaowu Xu's avatar
Yaowu Xu committed
2
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
Jingning Han's avatar
Jingning Han committed
3
 *
Yaowu Xu's avatar
Yaowu Xu committed
4 5 6 7 8 9
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
Jingning Han's avatar
Jingning Han committed
10 11
 */

Yaowu Xu's avatar
Yaowu Xu committed
12 13
#ifndef AOM_DSP_X86_TXFM_COMMON_SSE2_H_
#define AOM_DSP_X86_TXFM_COMMON_SSE2_H_
Jingning Han's avatar
Jingning Han committed
14 15

#include <emmintrin.h>
Yaowu Xu's avatar
Yaowu Xu committed
16
#include "aom/aom_integer.h"
17
#include "aom_dsp/x86/synonyms.h"
Jingning Han's avatar
Jingning Han committed
18

clang-format's avatar
clang-format committed
19
#define pair_set_epi16(a, b)                                            \
Jingning Han's avatar
Jingning Han committed
20 21 22
  _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))

clang-format's avatar
clang-format committed
23
#define dual_set_epi16(a, b)                                            \
Jingning Han's avatar
Jingning Han committed
24 25 26
  _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
                (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))

clang-format's avatar
clang-format committed
27
#define octa_set_epi16(a, b, c, d, e, f, g, h)                           \
Jingning Han's avatar
Jingning Han committed
28 29 30
  _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
                 (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))

31 32 33 34 35 36 37
// Reverse the 8 16 bit words in __m128i
static INLINE __m128i mm_reverse_epi16(const __m128i x) {
  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
  return _mm_shuffle_epi32(b, 0x4e);
}

38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228
// Identity transform (both forward and inverse).
static INLINE void idtx16_8col(__m128i *in) {
  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i y0, y1, y2, y3, y4, y5, y6, y7;

  in[0] = _mm_slli_epi16(in[0], 1);
  in[1] = _mm_slli_epi16(in[1], 1);
  in[2] = _mm_slli_epi16(in[2], 1);
  in[3] = _mm_slli_epi16(in[3], 1);
  in[4] = _mm_slli_epi16(in[4], 1);
  in[5] = _mm_slli_epi16(in[5], 1);
  in[6] = _mm_slli_epi16(in[6], 1);
  in[7] = _mm_slli_epi16(in[7], 1);
  in[8] = _mm_slli_epi16(in[8], 1);
  in[9] = _mm_slli_epi16(in[9], 1);
  in[10] = _mm_slli_epi16(in[10], 1);
  in[11] = _mm_slli_epi16(in[11], 1);
  in[12] = _mm_slli_epi16(in[12], 1);
  in[13] = _mm_slli_epi16(in[13], 1);
  in[14] = _mm_slli_epi16(in[14], 1);
  in[15] = _mm_slli_epi16(in[15], 1);

  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
  v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
  v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
  v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
  v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);

  u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
  u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
  u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
  u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
  u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
  u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
  u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
  u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);

  x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
  x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
  x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
  x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
  x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
  x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
  x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
  x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);

  y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
  y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
  y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
  y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
  y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
  y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
  y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
  y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);

  v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
  v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
  v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
  v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
  v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
  v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
  v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
  v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);

  x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
  x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
  x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
  x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
  x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
  x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
  x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
  x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);

  u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
  u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
  u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
  u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
  u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
  u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
  u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
  u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);

  y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
  y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
  y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
  y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
  y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
  y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
  y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
  y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);

  v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
  x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
  x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
  x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
  x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
  x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
  x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
  x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);

  u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
  y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
  y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
  y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
  y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
  y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
  y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
  y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
  x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
  x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
  x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
  x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
  x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
  x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
  x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);

  u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
  y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
  y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
  y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
  y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
  y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
  y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
  y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(v0, x0);
  in[1] = _mm_packs_epi32(v1, x1);
  in[2] = _mm_packs_epi32(v2, x2);
  in[3] = _mm_packs_epi32(v3, x3);
  in[4] = _mm_packs_epi32(v4, x4);
  in[5] = _mm_packs_epi32(v5, x5);
  in[6] = _mm_packs_epi32(v6, x6);
  in[7] = _mm_packs_epi32(v7, x7);

  in[8] = _mm_packs_epi32(u0, y0);
  in[9] = _mm_packs_epi32(u1, y1);
  in[10] = _mm_packs_epi32(u2, y2);
  in[11] = _mm_packs_epi32(u3, y3);
  in[12] = _mm_packs_epi32(u4, y4);
  in[13] = _mm_packs_epi32(u5, y5);
  in[14] = _mm_packs_epi32(u6, y6);
  in[15] = _mm_packs_epi32(u7, y7);
}

229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
static INLINE void scale_sqrt2_8x4(__m128i *in) {
  // Implements ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS), for 32
  // consecutive elements.
  const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);

  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
  const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
  const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
  const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
  const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);

  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
  const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
  const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
  const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
  const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);

  in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
  in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
  in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
  in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
}

static INLINE void scale_sqrt2_8x8(__m128i *in) {
  // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
  // for each element.
  const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);

  const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
  const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
  const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
  const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
  const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
  const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
  const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
  const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
  const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
  const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
  const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
  const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
  const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
  const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
  const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
  const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);

  const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
  const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
  const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
  const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
  const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
  const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
  const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
  const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
  const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
  const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
  const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
  const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
  const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
  const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
  const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
  const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);

  in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
  in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
  in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
  in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
  in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS));
  in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS));
  in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS));
  in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS),
                          xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
}

static INLINE void scale_sqrt2_8x16(__m128i *in) {
  scale_sqrt2_8x8(in);
  scale_sqrt2_8x8(in + 8);
}

Yaowu Xu's avatar
Yaowu Xu committed
324
#endif  // AOM_DSP_X86_TXFM_COMMON_SSE2_H_