// highbd_loopfilter_sse2.c
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>  // SSE2

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/x86/lpf_common_sse2.h"
#include "aom_ports/emmintrin_compat.h"
#include "aom_ports/mem.h"

19 20 21
// Clamp each signed 16-bit lane of *pixel into the inclusive range
// [*min, *max] in place.
//
// The original used a 6-instruction cmpgt/andnot/and/or select chain; SSE2
// provides signed 16-bit min/max directly, so the same signed clamp is two
// instructions and much clearer.
static INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
                               __m128i *pixel) {
  *pixel = _mm_min_epi16(*pixel, *max);  // cap at upper bound
  *pixel = _mm_max_epi16(*pixel, *min);  // raise to lower bound
}

34 35 36 37 38
// Widen the three 8-bit loop-filter thresholds (blimit, limit, thresh) to
// 16 bits and scale them to the working bit depth by shifting left bd - 8.
static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
                             const uint8_t *t, int bd, __m128i *blt,
                             __m128i *lt, __m128i *thr) {
  const int shift = bd - 8;
  const __m128i zero = _mm_setzero_si128();

  const __m128i blimit8 =
      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
  const __m128i limit8 =
      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
  const __m128i thresh8 =
      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);

  *blt = _mm_slli_epi16(blimit8, shift);
  *lt = _mm_slli_epi16(limit8, shift);
  *thr = _mm_slli_epi16(thresh8, shift);
}

// Gather `size` rows above s into p[] (nearest row first: p[0] is the row
// directly above s) and `size` rows at/below s into q[] (q[0] is the row
// at s). Loads are unaligned.
static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
                                     __m128i *p, __m128i *q) {
  const uint16_t *above = s - pitch;
  const uint16_t *below = s;
  int i;
  for (i = 0; i < size; ++i) {
    p[i] = _mm_loadu_si128((const __m128i *)above);
    q[i] = _mm_loadu_si128((const __m128i *)below);
    above -= pitch;
    below += pitch;
  }
}
// |a - b| for unsigned 16-bit lanes: (a -sat b) | (b -sat a).
// Compute the high-edge-variance (hev) mask: a lane is all ones when
// max(|p1 - p0|, |q1 - q0|) > *t (thresh), all zeros otherwise. Absolute
// differences of unsigned data use the saturating-subtract identity
// |a - b| == (a -sat b) | (b -sat a).
static INLINE void highbd_hev_mask(const __m128i *p, const __m128i *q,
                                   const __m128i *t, __m128i *hev) {
  const __m128i abs_p1p0 =
      _mm_or_si128(_mm_subs_epu16(p[1], p[0]), _mm_subs_epu16(p[0], p[1]));
  const __m128i abs_q1q0 =
      _mm_or_si128(_mm_subs_epu16(q[1], q[0]), _mm_subs_epu16(q[0], q[1]));
  __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
  h = _mm_subs_epu16(h, *t);  // zero exactly where the variance <= thresh

  const __m128i ffff = _mm_set1_epi16(0xFFFF);
  const __m128i zero = _mm_setzero_si128();
  // hev = ~(h == 0): ones where the variance exceeds thresh.
  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
}
72

73 74 75 76 77 78 79 80 81
// Build the filter-decision mask over p[0..3] / q[0..3]. A lane is all ones
// when every activity test passes:
//   2 * |p0 - q0| + |p1 - q1| / 2 <= blimit, and
//   |p[i] - p[i-1]| <= limit and |q[i] - q[i-1]| <= limit for i = 1..3.
static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q,
                                      const __m128i *l, const __m128i *bl,
                                      __m128i *mask) {
  __m128i abs_p0q0 =
      _mm_or_si128(_mm_subs_epu16(p[0], q[0]), _mm_subs_epu16(q[0], p[0]));
  __m128i abs_p1q1 =
      _mm_or_si128(_mm_subs_epu16(p[1], q[1]), _mm_subs_epu16(q[1], p[1]));
  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // 2 * |p0 - q0|
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);         // |p1 - q1| / 2

  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i ffff = _mm_set1_epi16(0xFFFF);
  // Seed `max` with limit + 1 in lanes that FAIL the blimit test and 0 in
  // lanes that pass, so a failing lane can never satisfy the final
  // max <= limit comparison below.
  __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));

  // Fold in every neighbouring-pixel difference on both sides of the edge.
  int i;
  for (i = 1; i < 4; ++i) {
    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[i - 1]),
                                          _mm_subs_epu16(p[i - 1], p[i])));
    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[i - 1]),
                                          _mm_subs_epu16(q[i - 1], q[i])));
  }
  max = _mm_subs_epu16(max, *l);
  *mask = _mm_cmpeq_epi16(max, zero);  // all ones where every test passed
}
100

101 102 103 104 105 106 107 108 109 110 111
// Shared helper for the flatness masks: a lane of *flat is all ones when
// |p[i] - p[0]| and |q[i] - q[0]| are <= the bit-depth-scaled threshold for
// every i in [start, end). *th is scaled x1 / x4 / x16 for bd 8 / 10 / 12.
static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p,
                                      const __m128i *q, int bd, int start,
                                      int end, __m128i *flat) {
  __m128i max = _mm_setzero_si128();
  int i;
  for (i = start; i < end; ++i) {
    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[0]),
                                          _mm_subs_epu16(p[0], p[i])));
    max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[0]),
                                          _mm_subs_epu16(q[0], q[i])));
  }

  // Scale the threshold to the bit depth, then test max <= threshold via
  // saturating subtraction (result is zero iff the test passes).
  __m128i ft;
  if (bd == 8)
    ft = _mm_subs_epu16(max, *th);
  else if (bd == 10)
    ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2));
  else  // bd == 12
    ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4));

  const __m128i zero = _mm_setzero_si128();
  *flat = _mm_cmpeq_epi16(ft, zero);
}
124

125 126 127 128 129 130 131 132
// Note:
//  Access p[3-1], p[0], and q[3-1], q[0]
// Flatness of the inner 8-pixel span: compares distances 1..3 against p0/q0.
static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p,
                                     const __m128i *q, __m128i *flat, int bd) {
  // check the distance 1,2,3 against 0
  flat_mask_internal(th, p, q, bd, 1, 4, flat);
}

133 134 135 136 137 138 139 140
// Note:
//  access p[6-4], p[0], and q[6-4], q[0]
// Wide-flatness test for the 14-tap filter: distances 4..6 against p0/q0.
static INLINE void highbd_flat_mask4_13(const __m128i *th, const __m128i *p,
                                        const __m128i *q, __m128i *flat,
                                        int bd) {
  flat_mask_internal(th, p, q, bd, 4, 7, flat);
}

141 142 143 144 145 146 147 148 149 150 151
// Note:
//  access p[7-4], p[0], and q[7-4], q[0]
// Wide-flatness test over distances 4..7 against p0/q0 (16-tap variant).
static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p,
                                     const __m128i *q, __m128i *flat, int bd) {
  flat_mask_internal(th, p, q, bd, 4, 8, flat);
}

// 4-tap narrow loop filter on {p1, p0, q0, q1}, 8 lanes at a time.
// Pixels are biased into signed range by subtracting t80 (half the bd-bit
// range: 0x80 / 0x200 / 0x800 for bd 8 / 10 / 12), filtered with repeated
// clamping to the representable biased range [pmin, pmax], then re-biased.
// Filtered outputs go to ps[0..1] / qs[0..1]; p[] and q[] are read only.
static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask,
                                  const __m128i *th, int bd, __m128i *ps,
                                  __m128i *qs) {
  __m128i t80;
  if (bd == 8)
    t80 = _mm_set1_epi16(0x80);
  else if (bd == 10)
    t80 = _mm_set1_epi16(0x200);
  else  // bd == 12
    t80 = _mm_set1_epi16(0x800);

  // Bias into signed range centred on zero.
  __m128i ps0 = _mm_subs_epi16(p[0], t80);
  __m128i ps1 = _mm_subs_epi16(p[1], t80);
  __m128i qs0 = _mm_subs_epi16(q[0], t80);
  __m128i qs1 = _mm_subs_epi16(q[1], t80);

  const __m128i one = _mm_set1_epi16(1);
  // pmax = (1 << bd) - 1 - t80, pmin = -t80: the biased pixel range.
  const __m128i pmax =
      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
  const __m128i zero = _mm_setzero_si128();
  const __m128i pmin = _mm_subs_epi16(zero, t80);

  __m128i filter = _mm_subs_epi16(ps1, qs1);
  pixel_clamp(&pmin, &pmax, &filter);

  // The p1 - q1 term only contributes on high-edge-variance lanes.
  __m128i hev;
  highbd_hev_mask(p, q, th, &hev);
  filter = _mm_and_si128(filter, hev);

  // filter += 3 * (qs0 - ps0), clamped, then masked by the filter decision.
  const __m128i x = _mm_subs_epi16(qs0, ps0);
  filter = _mm_adds_epi16(filter, x);
  filter = _mm_adds_epi16(filter, x);
  filter = _mm_adds_epi16(filter, x);
  pixel_clamp(&pmin, &pmax, &filter);
  filter = _mm_and_si128(filter, *mask);

  const __m128i t3 = _mm_set1_epi16(3);
  const __m128i t4 = _mm_set1_epi16(4);

  // Rounded taps: filter1 = (filter + 4) >> 3, filter2 = (filter + 3) >> 3.
  __m128i filter1 = _mm_adds_epi16(filter, t4);
  __m128i filter2 = _mm_adds_epi16(filter, t3);
  pixel_clamp(&pmin, &pmax, &filter1);
  pixel_clamp(&pmin, &pmax, &filter2);
  filter1 = _mm_srai_epi16(filter1, 3);
  filter2 = _mm_srai_epi16(filter2, 3);

  qs0 = _mm_subs_epi16(qs0, filter1);
  pixel_clamp(&pmin, &pmax, &qs0);
  ps0 = _mm_adds_epi16(ps0, filter2);
  pixel_clamp(&pmin, &pmax, &ps0);

  // Remove the bias before returning the inner pair.
  qs[0] = _mm_adds_epi16(qs0, t80);
  ps[0] = _mm_adds_epi16(ps0, t80);

  // Outer taps receive half the rounded filter, only on non-hev lanes.
  filter = _mm_adds_epi16(filter1, one);
  filter = _mm_srai_epi16(filter, 1);
  filter = _mm_andnot_si128(hev, filter);

  qs1 = _mm_subs_epi16(qs1, filter);
  pixel_clamp(&pmin, &pmax, &qs1);
  ps1 = _mm_adds_epi16(ps1, filter);
  pixel_clamp(&pmin, &pmax, &ps1);

  qs[1] = _mm_adds_epi16(qs1, t80);
  ps[1] = _mm_adds_epi16(ps1, t80);
}
214

215
typedef enum { FOUR_PIXELS, EIGHT_PIXELS } PixelOutput;
216

217 218 219 220 221 222 223
// Filter one horizontal edge, 8 lanes wide, using up to 14 taps
// (p[6..0] above s, q[0..6] at/below s). Three filter strengths are
// computed — narrow (filter4), flat (7-tap) and wide-flat (13-tap) — and
// blended per lane by the mask / flat / flat2 decisions. FOUR_PIXELS
// stores 4 results per row (storel_epi64); EIGHT_PIXELS stores all 8 with
// aligned _mm_store_si128 — NOTE(review): that requires s be 16-byte
// aligned; confirm callers of the 8p variant guarantee this.
static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
                                                   const uint8_t *blt,
                                                   const uint8_t *lt,
                                                   const uint8_t *thr, int bd,
                                                   PixelOutput pixel_output) {
  __m128i blimit, limit, thresh;
  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);

  __m128i p[7], q[7];
  load_highbd_pixel(s, 7, pitch, p, q);

  __m128i mask;
  highbd_filter_mask(p, q, &limit, &blimit, &mask);

  __m128i flat, flat2;
  const __m128i one = _mm_set1_epi16(1);
  highbd_flat_mask4(&one, p, q, &flat, bd);
  highbd_flat_mask4_13(&one, p, q, &flat2, bd);

  // flat applies only where mask does; flat2 only where flat does.
  flat = _mm_and_si128(flat, mask);
  flat2 = _mm_and_si128(flat2, flat);

  __m128i ps[2], qs[2];
  highbd_filter4(p, q, &mask, &thresh, bd, ps, qs);

  // flat and wide flat calculations
  // flat_*  : 7-tap results for p2..p0 / q0..q2 (rounding bias `four`).
  // flat2_* : 13-tap results for p5..p0 / q0..q5 (rounding bias `eight`).
  // Computed with running sums that are updated incrementally between taps,
  // so the statement order below is load-bearing.
  __m128i flat_p[3], flat_q[3];
  __m128i flat2_p[6], flat2_q[6];
  {
    const __m128i eight = _mm_set1_epi16(8);
    const __m128i four = _mm_set1_epi16(4);

    __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
    __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));

    __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
    sum_p = _mm_add_epi16(sum_p, sum_lp);

    __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
    sum_q = _mm_add_epi16(sum_q, sum_lq);
    sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));

    flat2_p[0] = _mm_srli_epi16(
        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
                                           _mm_add_epi16(p[1], q[0]))),
        4);
    flat2_q[0] = _mm_srli_epi16(
        _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
                                           _mm_add_epi16(p[0], q[1]))),
        4);
    flat_p[0] =
        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
    flat_q[0] =
        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);

    __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
    __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
    __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
    __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);

    sum_q = _mm_sub_epi16(sum_p, p[5]);
    sum_p = _mm_sub_epi16(sum_p, q[5]);
    flat2_p[1] = _mm_srli_epi16(
        _mm_add_epi16(
            sum_p, _mm_add_epi16(
                       sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
        4);
    flat2_q[1] = _mm_srli_epi16(
        _mm_add_epi16(
            sum_q, _mm_add_epi16(
                       sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
        4);

    sum_lq = _mm_sub_epi16(sum_lp, p[2]);
    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
    flat_p[1] =
        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
    flat_q[1] =
        _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);

    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
    sum_p3 = _mm_add_epi16(sum_p3, p[3]);
    sum_q3 = _mm_add_epi16(sum_q3, q[3]);
    sum_p = _mm_sub_epi16(sum_p, q[4]);
    sum_q = _mm_sub_epi16(sum_q, p[4]);
    flat2_p[2] = _mm_srli_epi16(
        _mm_add_epi16(
            sum_p, _mm_add_epi16(
                       sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
        4);
    flat2_q[2] = _mm_srli_epi16(
        _mm_add_epi16(
            sum_q, _mm_add_epi16(
                       sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
        4);
    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
    sum_lq = _mm_sub_epi16(sum_lq, p[1]);
    flat_p[2] =
        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
    flat_q[2] =
        _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);

    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
    sum_p = _mm_sub_epi16(sum_p, q[3]);
    sum_q = _mm_sub_epi16(sum_q, p[3]);
    flat2_p[3] = _mm_srli_epi16(
        _mm_add_epi16(
            sum_p, _mm_add_epi16(
                       sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
        4);
    flat2_q[3] = _mm_srli_epi16(
        _mm_add_epi16(
            sum_q, _mm_add_epi16(
                       sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
        4);

    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
    sum_p = _mm_sub_epi16(sum_p, q[2]);
    sum_q = _mm_sub_epi16(sum_q, p[2]);
    flat2_p[4] = _mm_srli_epi16(
        _mm_add_epi16(
            sum_p, _mm_add_epi16(
                       sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
        4);
    flat2_q[4] = _mm_srli_epi16(
        _mm_add_epi16(
            sum_q, _mm_add_epi16(
                       sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
        4);

    sum_p6 = _mm_add_epi16(sum_p6, p[6]);
    sum_q6 = _mm_add_epi16(sum_q6, q[6]);
    sum_p = _mm_sub_epi16(sum_p, q[1]);
    sum_q = _mm_sub_epi16(sum_q, p[1]);
    flat2_p[5] = _mm_srli_epi16(
        _mm_add_epi16(
            sum_p, _mm_add_epi16(
                       sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
        4);
    flat2_q[5] = _mm_srli_epi16(
        _mm_add_epi16(
            sum_q, _mm_add_epi16(
                       sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
        4);
  }

  // highbd_filter8: blend the 7-tap results into p2..p0 / q0..q2 where
  // `flat` is set; elsewhere keep the narrow-filter (or original) values.
  p[2] = _mm_andnot_si128(flat, p[2]);
  //  p2 remains unchanged if !(flat && mask)
  flat_p[2] = _mm_and_si128(flat, flat_p[2]);
  //  when (flat && mask)
  p[2] = _mm_or_si128(p[2], flat_p[2]);  // full list of p2 values
  q[2] = _mm_andnot_si128(flat, q[2]);
  flat_q[2] = _mm_and_si128(flat, flat_q[2]);
  q[2] = _mm_or_si128(q[2], flat_q[2]);  // full list of q2 values

  int i;
  for (i = 1; i >= 0; i--) {
    ps[i] = _mm_andnot_si128(flat, ps[i]);
    flat_p[i] = _mm_and_si128(flat, flat_p[i]);
    p[i] = _mm_or_si128(ps[i], flat_p[i]);
    qs[i] = _mm_andnot_si128(flat, qs[i]);
    flat_q[i] = _mm_and_si128(flat, flat_q[i]);
    q[i] = _mm_or_si128(qs[i], flat_q[i]);
  }

  // highbd_filter16: blend the wide-flat results where `flat2` is set and
  // store each finished row immediately.
  for (i = 5; i >= 0; i--) {
    //  p[i] remains unchanged if !(flat2 && flat && mask)
    p[i] = _mm_andnot_si128(flat2, p[i]);
    flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
    //  get values for when (flat2 && flat && mask)
    p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values

    q[i] = _mm_andnot_si128(flat2, q[i]);
    flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
    q[i] = _mm_or_si128(q[i], flat2_q[i]);
    if (pixel_output == FOUR_PIXELS) {
      _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]);
      _mm_storel_epi64((__m128i *)(s + i * pitch), q[i]);
    } else {
      _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
      _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
    }
  }
}

// Note:
//  highbd_lpf_horz_edge_8_8p() output 8 pixels per register
//  highbd_lpf_horz_edge_8_4p() output 4 pixels per register
static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch,
                                             const uint8_t *blt,
                                             const uint8_t *lt,
                                             const uint8_t *thr, int bd) {
  highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS);
}

// 8-pixel-output variant; uses aligned stores internally.
static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch,
                                             const uint8_t *blt,
                                             const uint8_t *lt,
                                             const uint8_t *thr, int bd) {
  highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, EIGHT_PIXELS);
}

425
// Public entry: 14-tap horizontal loop filter over one 4-pixel column group.
void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh, int bd) {
  highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
}

432
// Dual variant: filters two adjacent 4-pixel column groups with the same
// thresholds. Two 4-pixel passes are used (rather than one 8-pixel pass)
// — presumably to avoid the aligned stores of the 8p path; confirm before
// changing.
void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_blimit,
                                            const uint8_t *_limit,
                                            const uint8_t *_thresh, int bd) {
  highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
  highbd_lpf_horz_edge_8_4p(s + 4, p, _blimit, _limit, _thresh, bd);
}

// Write the six filtered rows p2..q2 (4 pixels each, low 64 bits of every
// register) back around s: p2 lands at s - 3*p, q2 at s + 2*p.
static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1,
                                      const __m128i *p0, const __m128i *q0,
                                      const __m128i *q1, const __m128i *q2,
                                      int p, uint16_t *s) {
  uint16_t *row = s - 3 * p;  // start at the p2 row and walk downward
  _mm_storel_epi64((__m128i *)row, *p2);
  row += p;
  _mm_storel_epi64((__m128i *)row, *p1);
  row += p;
  _mm_storel_epi64((__m128i *)row, *p0);
  row += p;
  _mm_storel_epi64((__m128i *)row, *q0);
  row += p;
  _mm_storel_epi64((__m128i *)row, *q1);
  row += p;
  _mm_storel_epi64((__m128i *)row, *q2);
}

Yaowu Xu's avatar
Yaowu Xu committed
452
// Public entry: 8-tap horizontal loop filter (self-contained, does not use
// the shared helpers above). Computes the filter/hev/flat masks inline,
// builds the 7-tap flat-filter outputs into aligned scratch arrays, runs
// the narrow 4-tap filter, and blends per lane by `flat`.
void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
  // Scratch for the 7-tap flat-filter results. Only the first 8 entries
  // (one __m128i) of each are written/read — presumably sized 16 for
  // historical reasons; TODO confirm.
  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  // Load four rows above and four rows at/below the edge (unaligned).
  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i ffff = _mm_cmpeq_epi16(one, one);
  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
  const __m128i four = _mm_set1_epi16(4);  // NOTE(review): same value as t4
  __m128i workp_a, workp_b, workp_shft;

  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  // Widen the 8-bit thresholds to 16 bits and scale them to the bit depth;
  // t80 is half the bd-bit range, used to bias pixels into signed range.
  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_set1_epi16(0x200);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_set1_epi16(0x800);
  }

  // Signed-biased copies of the inner four rows for the narrow filter.
  ps1 = _mm_subs_epi16(p1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);
  qs1 = _mm_subs_epi16(q1, t80);

  // filter_mask and hev_mask
  // |a - b| via the unsigned saturating-subtract trick throughout.
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(abs_p1p0, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  mask = _mm_max_epi16(abs_q1q0, mask);
  // mask |= (abs(q1 - q0) > limit) * -1;

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  flat = _mm_max_epi16(abs_p1p0, flat);
  flat = _mm_max_epi16(abs_q1q0, flat);

  // Flat threshold is 1, scaled to the bit depth (1 / 4 / 16).
  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  flat = _mm_and_si128(flat, mask);  // flat & mask

  // 7-tap flat filter: running sums workp_a/workp_b are updated
  // incrementally between taps, so the statement order matters.
  // Added before shift for rounding part of ROUND_POWER_OF_TWO

  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);

  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);

  // lp filter
  // Narrow 4-tap filter on the signed-biased pixels, clamped to the biased
  // representable range [pmin, pmax] after every arithmetic step.
  const __m128i pmax =
      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
  const __m128i pmin = _mm_subs_epi16(zero, t80);

  filt = _mm_subs_epi16(ps1, qs1);
  pixel_clamp(&pmin, &pmax, &filt);

  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  // (aom_filter + 3 * (qs0 - ps0)) & mask
  pixel_clamp(&pmin, &pmax, &filt);
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi16(filt, t4);
  filter2 = _mm_adds_epi16(filt, t3);

  // Filter1 >> 3
  pixel_clamp(&pmin, &pmax, &filter1);
  filter1 = _mm_srai_epi16(filter1, 3);

  // Filter2 >> 3
  pixel_clamp(&pmin, &pmax, &filter2);
  filter2 = _mm_srai_epi16(filter2, 3);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  filt = _mm_andnot_si128(hev, filt);

  // Per-row blend: narrow-filter result where !flat, 7-tap result where
  // flat. p2/q2 take the original pixels (re-loaded) on the !flat side.
  work_a = _mm_subs_epi16(qs0, filter1);
  pixel_clamp(&pmin, &pmax, &work_a);
  work_a = _mm_adds_epi16(work_a, t80);
  q0 = _mm_load_si128((__m128i *)flat_oq0);
  work_a = _mm_andnot_si128(flat, work_a);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);

  work_a = _mm_subs_epi16(qs1, filt);
  pixel_clamp(&pmin, &pmax, &work_a);
  work_a = _mm_adds_epi16(work_a, t80);
  q1 = _mm_load_si128((__m128i *)flat_oq1);
  work_a = _mm_andnot_si128(flat, work_a);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);

  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q2 = _mm_load_si128((__m128i *)flat_oq2);
  work_a = _mm_andnot_si128(flat, work_a);
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);

  work_a = _mm_adds_epi16(ps0, filter2);
  pixel_clamp(&pmin, &pmax, &work_a);
  work_a = _mm_adds_epi16(work_a, t80);
  p0 = _mm_load_si128((__m128i *)flat_op0);
  work_a = _mm_andnot_si128(flat, work_a);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);

  work_a = _mm_adds_epi16(ps1, filt);
  pixel_clamp(&pmin, &pmax, &work_a);
  work_a = _mm_adds_epi16(work_a, t80);
  p1 = _mm_load_si128((__m128i *)flat_op1);
  work_a = _mm_andnot_si128(flat, work_a);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);

  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p2 = _mm_load_si128((__m128i *)flat_op2);
  work_a = _mm_andnot_si128(flat, work_a);
  p2 = _mm_and_si128(flat, p2);
  p2 = _mm_or_si128(work_a, p2);

  store_horizontal_8(&p2, &p1, &p0, &q0, &q1, &q2, p, s);
}

Yaowu Xu's avatar
Yaowu Xu committed
682
// Dual variant: filters two adjacent 4-pixel column groups, each with its
// own blimit/limit/thresh set.
void aom_highbd_lpf_horizontal_8_dual_sse2(
    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
    const uint8_t *_thresh1, int bd) {
  aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
  aom_highbd_lpf_horizontal_8_sse2(s + 4, p, _blimit1, _limit1, _thresh1, bd);
}

Yaowu Xu's avatar
Yaowu Xu committed
690
// High-bitdepth 4-tap horizontal loop filter for one edge segment.
// s points at the first pixel of row q0 (the row just below the edge); p is
// the stride in uint16_t elements. _blimit/_limit/_thresh hold 8-bit
// thresholds that are shifted up to the working bit depth (bd = 8, 10 or 12)
// before use. Loads touch 8 lanes but only the low 4 filtered pixels are
// stored back (see the _mm_storel_epi64 stores at the end).
void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  // Two rows above and two rows below the edge.
  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  // |p1 - p0| and |q1 - q0| via saturating unsigned subtraction both ways.
  const __m128i abs_p1p0 =
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  const __m128i abs_q1q0 =
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i abs_p0q0 =
      _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  __m128i abs_p1q1 =
      _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));

  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;   // signed-offset bias (0x80 scaled to bit depth)
  __m128i tff80;
  __m128i tffe0;
  // t1f: equivalent to shifting 0x1f left by bitdepth - 8
  // and setting new bits to 1 (masks the magnitude after >> 3)
  __m128i t1f;
  const __m128i t1 = _mm_set1_epi16(0x1);
  // t7f: equivalent to shifting 0x7f left by bitdepth - 8
  // and setting new bits to 1 (masks the magnitude after >> 1)
  __m128i t7f;
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  // Scale the 8-bit thresholds and bias constants to the working bit depth.
  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
    tff80 = _mm_set1_epi16(0xff80);
    tffe0 = _mm_set1_epi16(0xffe0);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
  }

  // Convert pixels to the signed domain by subtracting the bias.
  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);

  // filter_mask and hev_mask
  // hev = max(|p1-p0|, |q1-q0|) > thresh (per lane, all-ones when true).
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(flat, mask);

  // Lane passes the filter mask iff every folded difference <= limit.
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // filter4
  // Valid signed-domain pixel range: [-t80, (1 << bd) - 1 - t80].
  const __m128i pmax =
      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
  const __m128i pmin = _mm_subs_epi16(zero, t80);

  filt = _mm_subs_epi16(ps1, qs1);
  pixel_clamp(&pmin, &pmax, &filt);
  // Outer-tap term only contributes on high-edge-variance lanes.
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  pixel_clamp(&pmin, &pmax, &filt);

  // (aom_filter + 3 * (qs0 - ps0)) & mask
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi16(filt, t4);
  pixel_clamp(&pmin, &pmax, &filter1);

  filter2 = _mm_adds_epi16(filt, t3);
  pixel_clamp(&pmin, &pmax, &filter2);

  // Filter1 >> 3 (arithmetic shift emulated with a logical shift plus
  // explicit reinsertion of the sign bits)
  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, tffe0);    // sign bits for the values < 0
  filter1 = _mm_and_si128(filter1, t1f);    // clamp the range
  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits

  // Filter2 >> 3
  work_a = _mm_cmpgt_epi16(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, tffe0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);

  // filt >> 1 (rounded: filter1 + 1 before the shift)
  filt = _mm_adds_epi16(filter1, t1);
  work_a = _mm_cmpgt_epi16(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, tff80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);

  // Outer taps (p1/q1) are only adjusted on low-variance (non-hev) lanes.
  filt = _mm_andnot_si128(hev, filt);

  // Apply the filters, clamp to the legal range, and re-bias to unsigned.
  q0 = _mm_subs_epi16(qs0, filter1);
  pixel_clamp(&pmin, &pmax, &q0);
  q0 = _mm_adds_epi16(q0, t80);

  q1 = _mm_subs_epi16(qs1, filt);
  pixel_clamp(&pmin, &pmax, &q1);
  q1 = _mm_adds_epi16(q1, t80);

  p0 = _mm_adds_epi16(ps0, filter2);
  pixel_clamp(&pmin, &pmax, &p0);
  p0 = _mm_adds_epi16(p0, t80);

  p1 = _mm_adds_epi16(ps1, filt);
  pixel_clamp(&pmin, &pmax, &p1);
  p1 = _mm_adds_epi16(p1, t80);
  // Store the low 4 filtered pixels of each row.
  _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
  _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
  _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
  _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
}

Yaowu Xu's avatar
Yaowu Xu committed
854
// Apply the high-bitdepth 4-tap horizontal loop filter across two adjacent
// 4-column segments of the same edge, each segment with its own set of
// blimit/limit/thresh thresholds (sets 0 and 1).
// s points at the first pixel of the row below the edge; p is the stride in
// uint16_t elements; bd is the bit depth (8, 10 or 12).
void aom_highbd_lpf_horizontal_4_dual_sse2(
    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
    const uint8_t *_thresh1, int bd) {
  // Columns 0..3 use threshold set 0.
  aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
  // Columns 4..7 use threshold set 1.
  aom_highbd_lpf_horizontal_4_sse2(s + 4, p, _blimit1, _limit1, _thresh1, bd);
}

Yaowu Xu's avatar
Yaowu Xu committed
862
void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
clang-format's avatar
clang-format committed
863
                                    const uint8_t *limit, const uint8_t *thresh,
864
                                    int bd) {
865
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
866 867 868 869 870 871 872 873 874 875
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, p, dst, 8, 1);

  // Loop filtering
Yaowu Xu's avatar
Yaowu Xu committed
876
  aom_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
877 878 879 880 881 882 883 884

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 1);
}

Yaowu Xu's avatar
Yaowu Xu committed
885
void aom_highbd_lpf_vertical_4_dual_sse2(
clang-format's avatar
clang-format committed
886 887 888
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
889
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
890 891 892 893 894 895 896
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
Yaowu Xu's avatar
Yaowu Xu committed
897
  aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
898 899 900 901 902 903 904 905 906 907
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, p, 2);
}

Yaowu Xu's avatar
Yaowu Xu committed
908
void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
clang-format's avatar
clang-format committed
909
                                    const uint8_t *limit, const uint8_t *thresh,
910
                                    int bd) {
911
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
912 913 914 915 916 917 918 919 920 921
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, p, dst, 8, 1);

  // Loop filtering
Yaowu Xu's avatar
Yaowu Xu committed
922
  aom_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
923 924 925 926 927 928 929 930

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 1);
}

Yaowu Xu's avatar
Yaowu Xu committed
931
void aom_highbd_lpf_vertical_8_dual_sse2(
clang-format's avatar
clang-format committed
932 933 934
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
935
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
936 937 938 939 940 941 942
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
Yaowu Xu's avatar
Yaowu Xu committed
943
  aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
944 945 946 947 948 949 950 951 952 953 954
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;

  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, p, 2);
}

955
void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int p, const uint8_t *blimit,
956
                                     const uint8_t *limit,
clang-format's avatar
clang-format committed
957
                                     const uint8_t *thresh, int bd) {
958
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
959 960 961 962 963 964 965 966 967 968 969 970
  uint16_t *src[2];
  uint16_t *dst[2];

  src[0] = s - 8;
  src[1] = s;
  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 8;

  // Transpose 16x8
  highbd_transpose(src, p, dst, 8, 2);

  // Loop filtering
971
  aom_highbd_lpf_horizontal_14_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
972
                                    bd);
973 974 975 976 977 978 979 980 981
  src[0] = t_dst;
  src[1] = t_dst + 8 * 8;
  dst[0] = s - 8;
  dst[1] = s;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 2);
}

982
void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int p,
983 984
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
clang-format's avatar
clang-format committed
985
                                          const uint8_t *thresh, int bd) {
986
  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
987 988 989 990

  //  Transpose 16x16
  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
991
  highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd);
992 993 994 995
  //  Transpose back
  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}