/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
Yaowu Xu's avatar
Yaowu Xu committed
15 16
#include "aom_dsp/vpx_dsp_common.h"
#include "aom_dsp/x86/fwd_txfm_sse2.h"
17 18 19 20 21

/*
 * DC-only forward transform of a 4x4 block.
 *
 * Adds up the sixteen int16_t samples of the block, doubles the total and
 * hands the resulting vector to store_output(). Only 32-bit lane 0 of that
 * vector holds the final value; lane 2 still carries a partial sum left over
 * from the horizontal reduction and lanes 1 and 3 are zero.
 *
 * input  - top-left sample of the block. Each row is fetched with a 64-bit
 *          load (four int16_t values).
 * output - destination, forwarded unchanged to store_output(). How many
 *          elements that helper writes is decided outside this file.
 * stride - distance between the starts of consecutive rows, in int16_t
 *          elements.
 */
void vp10_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0, in1;
  __m128i tmp;
  const __m128i zero = _mm_setzero_si128();

  // Pack two rows into each register: in0 = rows 0 and 3, in1 = rows 1 and 2.
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(
      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(
      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));

  // Eight 16-bit lanes, each the sum of two samples.
  tmp = _mm_add_epi16(in0, in1);

  // Sign-extend to 32 bits: place each 16-bit value in the upper half of a
  // 32-bit lane, then shift it back down arithmetically.
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  // Horizontal reduction of the four 32-bit lanes into lane 0.
  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

  in1 = _mm_add_epi32(tmp, in0);

  // Scale the total by two.
  in0 = _mm_slli_epi32(in1, 1);
  store_output(&in0, output);
}

/*
 * DC-only forward transform of an 8x8 block.
 *
 * Adds up the 64 int16_t samples of the block and hands the resulting vector
 * to store_output() without further scaling. Only 32-bit lane 0 of that
 * vector holds the total; lane 2 still carries a partial sum from the
 * horizontal reduction.
 *
 * Rows are accumulated in 16-bit lanes first (_mm_add_epi16 wraps on
 * overflow), eight samples per lane, and only then widened to 32 bits.
 *
 * input  - top-left sample of the block. Every row is read with
 *          _mm_load_si128, an aligned 128-bit load, so input + r * stride
 *          must be 16-byte aligned for r = 0..7.
 * output - destination, forwarded unchanged to store_output(). How many
 *          elements that helper writes is decided outside this file.
 * stride - distance between the starts of consecutive rows, in int16_t
 *          elements.
 */
void vp10_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i u0, u1, sum;

  // Rows 0..3, summed pairwise.
  u0 = _mm_add_epi16(in0, in1);
  u1 = _mm_add_epi16(in2, in3);

  in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));

  sum = _mm_add_epi16(u0, u1);

  // Rows 4..7, folded into the running column sums.
  in0 = _mm_add_epi16(in0, in1);
  in2 = _mm_add_epi16(in2, in3);
  sum = _mm_add_epi16(sum, in0);

  // u0 is reused as the zero vector for the widening step below.
  u0 = _mm_setzero_si128();
  sum = _mm_add_epi16(sum, in2);

  // Sign-extend the eight 16-bit column sums to 32 bits.
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  // Horizontal reduction of the four 32-bit lanes into lane 0.
  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  store_output(&in1, output);
}

/*
 * DC-only forward transform of a 16x16 block.
 *
 * Adds up the 256 int16_t samples of the block, arithmetically shifts the
 * total right by one bit and hands the resulting vector to store_output().
 * Only 32-bit lane 0 of that vector holds the final value; lane 2 still
 * carries a (shifted) partial sum from the horizontal reduction.
 *
 * The block is processed as two 8-column halves of 16 rows each. Partial
 * sums are kept in 16-bit lanes (_mm_add_epi16 wraps on overflow), 32 samples
 * per lane, and widened to 32 bits only after the loop.
 *
 * input  - top-left sample of the block. Every access is an aligned 128-bit
 *          load (_mm_load_si128), so input + r * stride and
 *          input + 8 + r * stride must be 16-byte aligned for r = 0..15.
 * output - destination, forwarded unchanged to store_output(). How many
 *          elements that helper writes is decided outside this file.
 * stride - distance between the starts of consecutive rows, in int16_t
 *          elements.
 */
void vp10_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                           int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 2; ++i) {
    // i == 0 leaves input on columns 0..7; i == 1 advances it by 8 so the
    // second pass covers columns 8..15.
    input += 8 * i;

    // Rows 0..3.
    in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));

    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    // Rows 4..7.
    in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    // Rows 8..11.
    in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    // Rows 12..15.
    in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
  }

  // u0 is reused as the zero vector for the widening step below.
  u0 = _mm_setzero_si128();

  // Sign-extend the eight 16-bit column sums to 32 bits.
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  // Horizontal reduction of the four 32-bit lanes into lane 0.
  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);

  // Scale: arithmetic shift right by one.
  in1 = _mm_srai_epi32(in1, 1);
  store_output(&in1, output);
}

/*
 * DC-only forward transform of a 32x32 block.
 *
 * Adds up the 1024 int16_t samples of the block, arithmetically shifts the
 * total right by three bits and hands the resulting vector to
 * store_output(). Only 32-bit lane 0 of that vector holds the final value;
 * lane 2 still carries a (shifted) partial sum from the horizontal reduction.
 *
 * Each loop iteration consumes four rows, reading every row as four 8-sample
 * vectors. Partial sums are kept in 16-bit lanes (_mm_add_epi16 wraps on
 * overflow), 128 samples per lane, and widened to 32 bits only after the
 * loop.
 *
 * input  - top-left sample of the block. Every access is an aligned 128-bit
 *          load (_mm_load_si128), so each row start must be 16-byte aligned.
 * output - destination, forwarded unchanged to store_output(). How many
 *          elements that helper writes is decided outside this file.
 * stride - distance between the starts of consecutive rows, in int16_t
 *          elements.
 */
void vp10_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
                           int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  // 8 iterations x 4 rows = 32 rows.
  for (i = 0; i < 8; ++i) {
    // First row of the group: columns 0..7, 8..15, 16..23, 24..31.
    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    // Second row of the group.
    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    // Third row of the group.
    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    // Fourth row of the group.
    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
  }

  // u0 is reused as the zero vector for the widening step below.
  u0 = _mm_setzero_si128();

  // Sign-extend the eight 16-bit column sums to 32 bits.
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  // Horizontal reduction of the four 32-bit lanes into lane 0.
  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);

  // Scale: arithmetic shift right by three.
  in1 = _mm_srai_epi32(in1, 3);
  store_output(&in1, output);
}

// Low-bit-depth instantiations (DCT_HIGH_BIT_DEPTH == 0).
//
// The implementation headers below are included repeatedly with different
// macro settings; the FDCT*_2D macros carry the function names for each
// inclusion. NOTE(review): the headers themselves are not visible here, so
// the exact way they consume these macros should be confirmed there.
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D vp10_fdct4x4_sse2
#define FDCT8x8_2D vp10_fdct8x8_sse2
#define FDCT16x16_2D vp10_fdct16x16_sse2
#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

// 32x32 variant with FDCT32x32_HIGH_PRECISION disabled.
#define FDCT32x32_2D vp10_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

// 32x32 variant with FDCT32x32_HIGH_PRECISION enabled.
#define FDCT32x32_2D vp10_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH

#if CONFIG_VPX_HIGHBITDEPTH
// High-bit-depth instantiations (DCT_HIGH_BIT_DEPTH == 1), compiled only when
// CONFIG_VPX_HIGHBITDEPTH is non-zero. The same implementation headers are
// included again, this time with the vp10_highbd_* names.
// NOTE(review): the headers themselves are not visible here, so the exact way
// they consume these macros should be confirmed there.
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D vp10_highbd_fdct4x4_sse2
#define FDCT8x8_2D vp10_highbd_fdct8x8_sse2
#define FDCT16x16_2D vp10_highbd_fdct16x16_sse2
#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h"  // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

// 32x32 variant with FDCT32x32_HIGH_PRECISION disabled.
#define FDCT32x32_2D vp10_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

// 32x32 variant with FDCT32x32_HIGH_PRECISION enabled.
#define FDCT32x32_2D vp10_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#endif  // CONFIG_VPX_HIGHBITDEPTH