/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

12
#include <emmintrin.h>  // SSE2
13

Yaowu Xu's avatar
Yaowu Xu committed
14
#include "./aom_dsp_rtcd.h"
15
#include "aom_dsp/x86/synonyms.h"
16 17
#include "aom_ports/mem.h"
#include "aom_ports/emmintrin_compat.h"
John Koleszar's avatar
John Koleszar committed
18

19 20 21 22
// Per-lane absolute difference |a - b| of sixteen unsigned bytes.
// An unsigned saturating subtraction clamps at zero, so in every lane at most
// one of (a - b) and (b - a) is non-zero; OR-ing the two yields the magnitude.
static INLINE __m128i abs_diff(__m128i a, __m128i b) {
  const __m128i a_minus_b = _mm_subs_epu8(a, b);
  const __m128i b_minus_a = _mm_subs_epu8(b, a);
  return _mm_or_si128(a_minus_b, b_minus_a);
}

23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
#if CONFIG_PARALLEL_DEBLOCKING
// filter_mask and hev_mask
#define FILTER_HEV_MASK4                                                      \
  do {                                                                        \
    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
    __m128i flat = abs_diff(q1p1, q0p0);                                      \
    /* abs(p1 - q1), abs(p0 - q0) */                                          \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
    __m128i abs_p0q0, abs_p1q1;                                               \
                                                                              \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
    hev =                                                                     \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* const int8_t mask = filter_mask2(*limit, *blimit, */                   \
    /*                                  p1, p0, q0, q1); */                   \
    abs_p0q0 =                                                                \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
    abs_p1q1 =                                                                \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
    mask = _mm_unpacklo_epi64(mask, flat);                                    \
    mask = _mm_subs_epu8(mask, limit);                                        \
    mask = _mm_cmpeq_epi8(mask, zero);                                        \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)
#endif  // CONFIG_PARALLEL_DEBLOCKING

// filter_mask and hev_mask
#define FILTER_HEV_MASK                                                       \
  do {                                                                        \
    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
    __m128i flat = abs_diff(q1p1, q0p0);                                      \
    /* abs(p1 - q1), abs(p0 - q0) */                                          \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
    __m128i abs_p0q0, abs_p1q1, work;                                         \
                                                                              \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
    hev =                                                                     \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
    /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */    \
    abs_p0q0 =                                                                \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
    abs_p1q1 =                                                                \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
    /* abs(p3 - p2), abs(p2 - p1) */                                          \
    work = abs_diff(p3p2, p2p1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    /* abs(q3 - q2), abs(q2 - q1) */                                          \
    work = abs_diff(q3q2, q2q1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
    mask = _mm_unpacklo_epi64(mask, flat);                                    \
    mask = _mm_subs_epu8(mask, limit);                                        \
    mask = _mm_cmpeq_epi8(mask, zero);                                        \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)

95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
// Core of the 4-tap loop filter, operating on two packed 8-byte rows per side.
//
// Register layout used by the callers in this file: the low 8 bytes of
// *p1p0 / *q1q0 hold p0 / q0 and the high 8 bytes hold p1 / q1.
//
//   p1p0, q1q0      in:  unfiltered pixels on each side of the edge.
//   hev             in:  high-edge-variance byte mask.
//                   out: overwritten with the adjustment that was added to
//                        the p side (filter2 in the low half, the halved
//                        outer-tap filter in the high half).
//   mask            in:  byte mask selecting the positions to filter.
//   qs1qs0, ps1ps0  out: filtered q / p pixels, same layout as the inputs.
AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *hev,
                                   __m128i *mask, __m128i *qs1qs0,
                                   __m128i *ps1ps0) {
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  const __m128i minus_one = _mm_cmpeq_epi8(sign_bit, sign_bit);
  // Rounding terms: +4 in the low 8 bytes (filter1), +3 in the high 8 bytes
  // (filter2).
  const __m128i round_3_4 =
      _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
  const __m128i hev_in = *hev;

  // Flip the sign bit (^ 0x80) so the unsigned pixels can go through signed
  // saturating arithmetic.
  __m128i ps = _mm_xor_si128(*p1p0, sign_bit);
  __m128i qs = _mm_xor_si128(*q1q0, sign_bit);

  // low half: ps0 - qs0, high half: ps1 - qs1 (both saturated).
  const __m128i p_minus_q = _mm_subs_epi8(ps, qs);

  // int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
  __m128i base = _mm_and_si128(_mm_srli_si128(p_minus_q, 8), hev_in);
  // filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
  // Subtracting (ps0 - qs0) three times, saturating after every step.
  base = _mm_subs_epi8(base, p_minus_q);
  base = _mm_subs_epi8(base, p_minus_q);
  base = _mm_subs_epi8(base, p_minus_q);
  base = _mm_and_si128(base, *mask);
  // Replicate the low half so both rounding variants are computed at once.
  base = _mm_unpacklo_epi64(base, base);

  // filter1 = signed_char_clamp(filter + 4) >> 3;  (low half)
  // filter2 = signed_char_clamp(filter + 3) >> 3;  (high half)
  const __m128i rounded = _mm_adds_epi8(base, round_3_4);
  // Duplicating each byte into a 16-bit lane puts it in the upper byte, so an
  // arithmetic shift by 8 + 3 sign-extends it and applies the >> 3.
  const __m128i filter1_w =
      _mm_srai_epi16(_mm_unpacklo_epi8(rounded, rounded), 11);
  const __m128i filter2_w =
      _mm_srai_epi16(_mm_unpackhi_epi8(rounded, rounded), 11);
  const __m128i filter2filter1 = _mm_packs_epi16(filter1_w, filter2_w);

  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
  __m128i outer = _mm_subs_epi8(filter2filter1, minus_one); /* + 1 */
  outer = _mm_unpacklo_epi8(outer, outer);
  outer = _mm_srai_epi16(outer, 9); /* 8 to sign-extend, 1 to halve */
  outer = _mm_packs_epi16(outer, outer);
  outer = _mm_andnot_si128(hev_in, outer);

  // p side: ps0 + filter2, ps1 + filter.
  // q side: qs0 - filter1, qs1 - filter.
  const __m128i p_adjust = _mm_unpackhi_epi64(filter2filter1, outer);
  const __m128i q_adjust = _mm_unpacklo_epi64(filter2filter1, outer);
  *hev = p_adjust;  // *hev doubles as scratch output

  qs = _mm_subs_epi8(qs, q_adjust);
  ps = _mm_adds_epi8(ps, p_adjust);
  *qs1qs0 = _mm_xor_si128(qs, sign_bit); /* ^ 0x80 */
  *ps1ps0 = _mm_xor_si128(ps, sign_bit); /* ^ 0x80 */
}
144

Yaowu Xu's avatar
Yaowu Xu committed
145
void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
146 147 148 149 150 151 152 153
                               const uint8_t *_blimit, const uint8_t *_limit,
                               const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
154 155 156 157
#if !CONFIG_PARALLEL_DEBLOCKING
  __m128i p3p2, p2p1, q3q2, q2q1;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
158
  __m128i mask, hev;
159
#if !CONFIG_PARALLEL_DEBLOCKING
160 161
  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
162
#endif  // !CONFIG_PARALLEL_DEBLOCKING
163 164 165 166
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
167
#if !CONFIG_PARALLEL_DEBLOCKING
168 169
  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
170
#endif  // !CONFIG_PARALLEL_DEBLOCKING
171 172
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
173 174
#if !CONFIG_PARALLEL_DEBLOCKING
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
175
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
176 177
#endif  // !CONFIG_PARALLEL_DEBLOCKING
#if !CONFIG_PARALLEL_DEBLOCKING
178
  FILTER_HEV_MASK;
179 180 181
#else   // CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK4;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
182
  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
183

184
#if CONFIG_PARALLEL_DEBLOCKING
185 186 187 188
  xx_storel_32(s - 1 * p, ps1ps0);
  xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8));
  xx_storel_32(s + 0 * p, qs1qs0);
  xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8));
189
#else
190
  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
clang-format's avatar
clang-format committed
191 192
  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
193
  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
194
#endif
195 196
}

Yaowu Xu's avatar
Yaowu Xu committed
197
void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
198 199 200 201 202 203 204 205
                             const uint8_t *_blimit, const uint8_t *_limit,
                             const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
206

207
  __m128i x0, x1, x2, x3;
208 209 210 211
#if !CONFIG_PARALLEL_DEBLOCKING
  __m128i p3p2, p2p1, q3q2, q2q1;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
  __m128i mask, hev;

  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));

  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));

  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));

  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));

  // Transpose 8x8
  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  x0 = _mm_unpacklo_epi16(x2, x3);
235
#if !CONFIG_PARALLEL_DEBLOCKING
236 237
  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
238
#endif  // !CONFIG_PARALLEL_DEBLOCKING
239 240
  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
241
#if !CONFIG_PARALLEL_DEBLOCKING
242
  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
243
#endif  // !CONFIG_PARALLEL_DEBLOCKING
244 245 246 247 248 249
  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high

  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  x2 = _mm_unpackhi_epi16(x2, x3);
250
#if !CONFIG_PARALLEL_DEBLOCKING
251 252
  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
253
#endif  // !CONFIG_PARALLEL_DEBLOCKING
254 255 256 257 258 259
  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  q1q0 = _mm_unpacklo_epi32(q1q0, x2);

  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
260
#if !CONFIG_PARALLEL_DEBLOCKING
261 262
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
263 264
#endif  // !CONFIG_PARALLEL_DEBLOCKING
#if !CONFIG_PARALLEL_DEBLOCKING
265
  FILTER_HEV_MASK;
266 267 268
#else   // CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK4;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
269
  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
270 271 272 273 274 275 276 277 278 279

  // Transpose 8x4 to 4x8
  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
280
#if !CONFIG_PARALLEL_DEBLOCKING
281 282
  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
283
#endif
284 285 286
  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);

287 288 289 290
  xx_storel_32(s + 0 * p - 2, ps1ps0);
  xx_storel_32(s + 1 * p - 2, _mm_srli_si128(ps1ps0, 4));
  xx_storel_32(s + 2 * p - 2, _mm_srli_si128(ps1ps0, 8));
  xx_storel_32(s + 3 * p - 2, _mm_srli_si128(ps1ps0, 12));
291
#if !CONFIG_PARALLEL_DEBLOCKING
292 293 294 295
  xx_storel_32(s + 4 * p - 2, qs1qs0);
  xx_storel_32(s + 5 * p - 2, _mm_srli_si128(qs1qs0, 4));
  xx_storel_32(s + 6 * p - 2, _mm_srli_si128(qs1qs0, 8));
  xx_storel_32(s + 7 * p - 2, _mm_srli_si128(qs1qs0, 12));
296 297 298
#endif
}

299
// Writes the two halves of `x` to a pair of rows placed symmetrically around
// the edge at `s`: the low half goes to row s - (num + 1) * p and the high half
// to row s + num * p. Four bytes per row are stored when
// CONFIG_PARALLEL_DEBLOCKING is set, eight bytes per row otherwise.
static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
  uint8_t *const row_above = s - (num + 1) * p;
  uint8_t *const row_below = s + num * p;
#if CONFIG_PARALLEL_DEBLOCKING
  xx_storel_32(row_above, x);
  xx_storel_32(row_below, _mm_srli_si128(x, 8));
#else
  xx_storel_64(row_above, x);
  _mm_storeh_pi((__m64 *)row_below, _mm_castsi128_ps(x));
#endif
}

James Zern's avatar
James Zern committed
309 310 311 312
void aom_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
313
  const __m128i zero = _mm_set1_epi16(0);
314
  const __m128i one = _mm_set1_epi8(1);
315 316 317 318
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
319 320 321 322
#if !CONFIG_DEBLOCK_13TAP
  __m128i q7p7;
#endif
  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
323
  __m128i abs_p1p0;
324

325
  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
clang-format's avatar
clang-format committed
326 327
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
328
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
clang-format's avatar
clang-format committed
329 330
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
331
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
clang-format's avatar
clang-format committed
332 333
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
334
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
clang-format's avatar
clang-format committed
335 336
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
337 338
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
clang-format's avatar
clang-format committed
339 340
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
341
  p0q0 = _mm_shuffle_epi32(q0p0, 78);
342

343
  {
344
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
345
    abs_p1p0 = abs_diff(q1p1, q0p0);
clang-format's avatar
clang-format committed
346
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
347 348
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
349 350
    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
351 352 353 354
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

clang-format's avatar
clang-format committed
355
    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
356 357 358 359
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
360
    mask = _mm_max_epu8(abs_p1p0, mask);
361 362
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
363

clang-format's avatar
clang-format committed
364
    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
365
    mask = _mm_max_epu8(work, mask);
366
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
367 368 369 370 371 372 373 374 375
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
376 377 378 379 380
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
381 382 383
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
384 385 386 387
#if !CONFIG_DEBLOCK_13TAP
    __m128i flat2_q6p6;
#endif
    __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
388
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
389

390 391
    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
392 393 394
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
Yaowu Xu's avatar
Yaowu Xu committed
395
    // (aom_filter + 3 * (qs0 - ps0)) & mask
396 397 398 399 400
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

401 402 403 404 405
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

406
    // Filter1 >> 3
407 408
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
409

410
    // filt >> 1
411 412 413 414 415 416
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
417 418 419 420
    // loopfilter done

    {
      __m128i work;
421
      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
422 423
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
424 425 426 427
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

428
      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
clang-format's avatar
clang-format committed
429 430
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
431 432

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
clang-format's avatar
clang-format committed
433 434
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
435
      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
436
#if !CONFIG_DEBLOCK_13TAP
437
      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
clang-format's avatar
clang-format committed
438 439
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
440
      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
441 442 443
#else
      work = abs_diff(q6p6, q0p0);
#endif
444
      flat2 = _mm_max_epu8(work, flat2);
445
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
446 447 448 449 450 451 452 453 454
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
455 456 457 458 459
#if !CONFIG_DEBLOCK_13TAP
      __m128i p7_16, q7_16;
#endif
      __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
460 461
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
462 463 464 465 466 467
#if !CONFIG_DEBLOCK_13TAP
      __m128i sum_p7, sum_q7;
#else
      __m128i sum_p6, sum_q6;
#endif
      __m128i sum_p3, sum_q3, res_p, res_q;
468

469
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
470
      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
471
#endif
472 473 474 475 476 477 478 479 480 481 482 483 484 485
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
486
#if !CONFIG_DEBLOCK_13TAP
487 488 489 490 491 492
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));
493 494 495 496
#else
      pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
#endif
497 498

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
clang-format's avatar
clang-format committed
499
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
500 501

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
clang-format's avatar
clang-format committed
502 503 504 505 506 507
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
508
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
509
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
510 511 512 513 514 515
#else
          _mm_add_epi16(pixelFilter_p,
                        _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
                                      _mm_add_epi16(p1_16, q0_16))),
          4);
#endif
clang-format's avatar
clang-format committed
516
      res_q = _mm_srli_epi16(
517
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
518
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
519 520 521 522 523 524
#else
          _mm_add_epi16(pixelFilter_p,
                        _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
                                      _mm_add_epi16(p0_16, q1_16))),
          4);
#endif
525
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
526

clang-format's avatar
clang-format committed
527 528 529 530
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
531 532 533

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

534
#if !CONFIG_DEBLOCK_13TAP
535 536
      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
537 538 539 540
#else
      sum_p6 = _mm_add_epi16(p6_16, p6_16);
      sum_q6 = _mm_add_epi16(q6_16, q6_16);
#endif
541 542 543
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

544
#if !CONFIG_DEBLOCK_13TAP
545 546
      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
547 548 549 550 551
#else
      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
#endif

clang-format's avatar
clang-format committed
552
      res_p = _mm_srli_epi16(
553
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
554
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
555 556 557 558 559 560 561
#else
          _mm_add_epi16(
              pixelFilter_p,
              _mm_add_epi16(sum_p6,
                            _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
          4);
#endif
clang-format's avatar
clang-format committed
562
      res_q = _mm_srli_epi16(
563
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
564
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
565 566 567 568 569 570 571
#else
          _mm_add_epi16(
              pixelFilter_q,
              _mm_add_epi16(sum_q6,
                            _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
          4);
#endif
572 573 574 575
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
clang-format's avatar
clang-format committed
576 577 578 579
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
580 581
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

582
#if !CONFIG_DEBLOCK_13TAP
583 584
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
585 586 587 588
#else
      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
#endif
589 590 591
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

592
#if !CONFIG_DEBLOCK_13TAP
593 594
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
595 596 597 598 599
#else
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
#endif

clang-format's avatar
clang-format committed
600
      res_p = _mm_srli_epi16(
601
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
602
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
603 604 605 606 607 608 609
#else
          _mm_add_epi16(
              pixelFilter_p,
              _mm_add_epi16(sum_p6,
                            _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
          4);
#endif
clang-format's avatar
clang-format committed
610
      res_q = _mm_srli_epi16(
611
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
612
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
613 614 615 616 617 618 619
#else
          _mm_add_epi16(
              pixelFilter_q,
              _mm_add_epi16(sum_q6,
                            _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
          4);
#endif
620 621 622 623 624
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

clang-format's avatar
clang-format committed
625 626 627 628
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
629 630
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

631
#if !CONFIG_DEBLOCK_13TAP
632 633
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
634 635 636 637 638 639
#else
      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
#endif

#if !CONFIG_DEBLOCK_13TAP
640 641
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
642 643 644 645 646
#else
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
#endif

clang-format's avatar
clang-format committed
647
      res_p = _mm_srli_epi16(
648
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
649
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
650 651 652 653 654 655 656
#else
          _mm_add_epi16(
              pixelFilter_p,
              _mm_add_epi16(sum_p6,
                            _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
          4);
#endif
clang-format's avatar
clang-format committed
657
      res_q = _mm_srli_epi16(
658
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
659
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
660 661 662 663 664 665 666
#else
          _mm_add_epi16(
              pixelFilter_q,
              _mm_add_epi16(sum_q6,
                            _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
          4);
#endif
667 668
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

669
#if !CONFIG_DEBLOCK_13TAP
670 671
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
672 673 674 675 676 677
#else
      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
#endif

#if !CONFIG_DEBLOCK_13TAP
678 679
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
680 681 682 683 684
#else
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
#endif

clang-format's avatar
clang-format committed
685
      res_p = _mm_srli_epi16(
686
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
687
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
688 689 690 691 692 693 694
#else
          _mm_add_epi16(
              pixelFilter_p,
              _mm_add_epi16(sum_p6,
                            _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
          4);
#endif
clang-format's avatar
clang-format committed
695
      res_q = _mm_srli_epi16(
696
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
697
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
698 699 700 701 702 703 704
#else
          _mm_add_epi16(
              pixelFilter_q,
              _mm_add_epi16(sum_q6,
                            _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
          4);
#endif
705 706
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

707
#if !CONFIG_DEBLOCK_13TAP
708 709
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
710 711 712 713 714 715
#else
      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
#endif

#if !CONFIG_DEBLOCK_13TAP
716 717
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
718 719 720 721 722
#else
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
#endif

clang-format's avatar
clang-format committed
723
      res_p = _mm_srli_epi16(
724
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
725
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
726 727 728 729 730 731 732
#else
          _mm_add_epi16(
              pixelFilter_p,
              _mm_add_epi16(sum_p6,
                            _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
          4);
#endif
clang-format's avatar
clang-format committed
733
      res_q = _mm_srli_epi16(
734
#if !CONFIG_DEBLOCK_13TAP
clang-format's avatar
clang-format committed
735
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
736 737 738 739 740 741 742
#else
          _mm_add_epi16(
              pixelFilter_q,
              _mm_add_epi16(sum_q6,
                            _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
          4);
#endif
743 744
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

745
#if !CONFIG_DEBLOCK_13TAP
746 747 748 749
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
clang-format's avatar
clang-format committed
750 751 752 753
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
754
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
755
#endif
756 757 758 759
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

760 761 762 763 764 765 766 767 768 769 770 771 772 773 774
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

775
#if !CONFIG_DEBLOCK_13TAP
776 777 778
    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
779
    store_buffer_horz_8(q6p6, p, 6, s);
780
#endif
781 782 783 784

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
785
    store_buffer_horz_8(q5p5, p, 5, s);
786 787 788 789

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
790
    store_buffer_horz_8(q4p4, p, 4, s);
791 792 793 794

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
795
    store_buffer_horz_8(q3p3, p, 3, s);
796 797 798 799

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
800
    store_buffer_horz_8(q2p2, p, 2, s);
801 802 803 804

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
805
    store_buffer_horz_8(q1p1, p, 1, s);
806 807 808 809

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
810
    store_buffer_horz_8(q0p0, p, 0, s);
811 812 813
  }
}

814 815 816 817 818 819 820 821
// Updates a running filter sum held as eight 16-bit lanes.
//
// Returns, lane by lane, *total + *a1 + *a2 - (*s1 + *s2). All arithmetic is
// done with _mm_add_epi16 / _mm_sub_epi16, i.e. plain (non-saturating) 16-bit
// adds and subtracts. The filter8 and wide-filter code in this file call it
// repeatedly to derive each successive output sum from the previous one by
// adding two taps and removing two taps.
//
//   total  - previous running sum
//   a1, a2 - taps to add
//   s1, s2 - taps to subtract
static INLINE __m128i filter_add2_sub2(const __m128i *const total,
                                       const __m128i *const a1,
                                       const __m128i *const a2,
                                       const __m128i *const s1,
                                       const __m128i *const s2) {
  __m128i x = _mm_add_epi16(*a1, *total);
  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
  return x;
}

824 825 826 827
// Finishes the 8-tap filter for one output row and blends it with another
// filter's result.
//
// *f8_lo and *f8_hi are 16-bit sums (callers in this file have already added
// the rounding constant 4). Each lane is shifted right by 3, then the two
// vectors are packed into 16 bytes with unsigned saturation, *f8_lo supplying
// the low 8 bytes and *f8_hi the high 8 bytes.
//
// The return value is a bitwise select: bits set in *flat come from the packed
// filter output, bits clear in *flat come from *other_filt.
static INLINE __m128i filter8_mask(const __m128i *const flat,
                                   const __m128i *const other_filt,
                                   const __m128i *const f8_lo,
                                   const __m128i *const f8_hi) {
  const __m128i f8 =
      _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
  const __m128i result = _mm_and_si128(*flat, f8);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}

834 835 836 837
// Finishes the wide (16-tap) filter for one output row and blends it with
// another filter's result.
//
// *f_lo and *f_hi are 16-bit sums (callers in this file have already added the
// rounding constant 8). Each lane is shifted right by 4, then the two vectors
// are packed into 16 bytes with unsigned saturation, *f_lo supplying the low
// 8 bytes and *f_hi the high 8 bytes.
//
// The return value is a bitwise select: bits set in *flat come from the packed
// filter output, bits clear in *flat come from *other_filt.
static INLINE __m128i filter16_mask(const __m128i *const flat,
                                    const __m128i *const other_filt,
                                    const __m128i *const f_lo,
                                    const __m128i *const f_hi) {
  const __m128i f =
      _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
  const __m128i result = _mm_and_si128(*flat, f);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}

844 845 846 847 848 849 850
// Selects which store helper store_buffer_horz_16() uses for each row:
// FOUR_PIXELS -> xx_storel_32, EIGHT_PIXELS -> xx_storel_64,
// SIXTEEN_PIXELS -> xx_storeu_128.
typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput;

// Writes 14 filtered rows from x[] back to the frame buffer.
//
// Row x[i] is stored at s - (i - offset) * p for i = 13 down to 0, so x must
// point to at least 14 vectors. The store helper is chosen by pixel_num:
//   FOUR_PIXELS    -> xx_storel_32
//   EIGHT_PIXELS   -> xx_storel_64
//   SIXTEEN_PIXELS -> xx_storeu_128
// NOTE(review): the xx_store* helpers are defined outside this chunk;
// presumably they write the low 32 / low 64 / all 128 bits of the vector --
// confirm against their definitions.
// Any other pixel_num value stores nothing.
//
//   pixel_num - store width selector (see above)
//   x         - array of 14 rows to write
//   p         - multiplier applied to the row index when forming the address
//   offset    - index of the row in x[] that is written exactly at s
//   s         - base destination pointer
static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x,
                                        int p, int offset, uint8_t *s) {
  if (pixel_num == FOUR_PIXELS) {
    for (int i = 13; i >= 0; i--) {
      xx_storel_32(s - (i - offset) * p, x[i]);
    }
  }
  if (pixel_num == EIGHT_PIXELS) {
    for (int i = 13; i >= 0; i--) {
      xx_storel_64(s - (i - offset) * p, x[i]);
    }
  }
  if (pixel_num == SIXTEEN_PIXELS) {
    for (int i = 13; i >= 0; i--) {
      xx_storeu_128(s - (i - offset) * p, x[i]);
    }
  }
}

static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num,
                                             unsigned char *s, int p,
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
871 872
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
873 874 875 876
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
877 878 879 880
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;

881 882 883 884 885 886 887
  __m128i op2, op1, op0, oq0, oq1, oq2;

  __m128i max_abs_p1p0q1q0;

  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
888 889 890 891 892 893 894 895 896 897
  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
898 899 900
  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
901 902

  {
903 904
    const __m128i abs_p1p0 = abs_diff(p1, p0);
    const __m128i abs_q1q0 = abs_diff(q1, q0);
905
    const __m128i fe = _mm_set1_epi8(0xfe);
906 907 908
    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    __m128i abs_p0q0 = abs_diff(p0, q0);
    __m128i abs_p1q1 = abs_diff(p1, q1);
909
    __m128i work;
910
    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
911

clang-format's avatar
clang-format committed
912
    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
913 914 915 916
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
917
    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
918 919
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
920
    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
921
    mask = _mm_max_epu8(work, mask);
922
    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
923 924 925 926 927
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950
  {
    __m128i work;
    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
    flat2 = _mm_max_epu8(work, flat2);
    flat2 = _mm_subs_epu8(flat2, one);
    flat2 = _mm_cmpeq_epi8(flat2, zero);
    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  }

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // filter4
951 952 953 954 955 956 957 958
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
959
    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
960 961 962 963 964

    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

965 966 967 968 969 970 971 972 973 974
    op1 = _mm_xor_si128(p1, t80);
    op0 = _mm_xor_si128(p0, t80);
    oq0 = _mm_xor_si128(q0, t80);
    oq1 = _mm_xor_si128(q1, t80);

    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

    work_a = _mm_subs_epi8(oq0, op0);
975 976 977
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
Yaowu Xu's avatar
Yaowu Xu committed
978
    // (aom_filter + 3 * (qs0 - ps0)) & mask
979 980 981 982
    filt = _mm_and_si128(filt, mask);
    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

983
    // Filter1 >> 3
984 985 986 987 988
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
989
    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
990

991
    // Filter2 >> 3
992 993 994 995 996
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
997
    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
998

999
    // filt >> 1
1000 1001 1002 1003 1004 1005 1006
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
1007 1008
    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
1009 1010
    // loopfilter done

1011 1012
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // filter8
1013
    {
1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045
      const __m128i four = _mm_set1_epi16(4);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);

      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      __m128i f8_lo, f8_hi;

      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
                            _mm_add_epi16(p3_lo, p2_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
                            _mm_add_epi16(p2_lo, p1_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);

      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
                            _mm_add_epi16(p3_hi, p2_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
                            _mm_add_epi16(p2_hi, p1_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);

1046
      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
1047

1048 1049 1050
      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
1051

1052 1053 1054
      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
1055

1056 1057 1058
      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
1059

1060 1061 1062
      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
1063

1064 1065 1066
      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
1067 1068 1069
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1070
    // wide flat calculations
1071 1072
    {
      const __m128i eight = _mm_set1_epi16(8);
1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110
      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);

      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);

      __m128i f_lo;
      __m128i f_hi;

      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
clang-format's avatar
clang-format committed
1111 1112
      f_lo =
          _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
1113 1114 1115 1116 1117 1118
      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
                           _mm_add_epi16(p2_lo, p1_lo));
      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);

      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
clang-format's avatar
clang-format committed
1119 1120
      f_hi =
          _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
1121 1122 1123 1124 1125
      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
                           _mm_add_epi16(p2_hi, p1_hi));
      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);

1126 1127
      __m128i x[14];
      x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
1128

1129 1130
      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
1131
      x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
1132

1133 1134
      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
1135
      x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
1136

1137 1138
      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
1139
      x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
1140

1141 1142
      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
1143
      x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
1144

1145 1146
      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
1147
      x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
1148

1149 1150
      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
1151
      x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
1152

1153 1154
      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
1155
      x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
1156

1157 1158
      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
1159
      x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
1160

1161 1162
      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
1163
      x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
1164

1165 1166
      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo,