clpf_simd.h 22.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./aom_dsp_rtcd.h"
Steinar Midtskogen's avatar
Steinar Midtskogen committed
13
#include "aom_ports/mem.h"
Steinar Midtskogen's avatar
Steinar Midtskogen committed
14
15
#include "aom_ports/bitops.h"
#include "av1/common/clpf_simd_kernel.h"
16

Steinar Midtskogen's avatar
Steinar Midtskogen committed
17
18
// Process blocks of width 8, two lines at a time, 8 bit.
static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
19
                        int dstride, int x0, int y0, int sizey,
20
21
                        BOUNDARY_TYPE bt, unsigned int strength,
                        unsigned int dmp) {
22
23
24
  const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1;
  const int right = !(bt & TILE_RIGHT_BOUNDARY);
  const int left = !(bt & TILE_LEFT_BOUNDARY);
25
  const int top = bt & TILE_ABOVE_BOUNDARY ? 0 : -1;
Steinar Midtskogen's avatar
Steinar Midtskogen committed
26
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
27
                  c_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
28
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
29
                  d_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
30
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
31
                  e_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
32
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
33
                  f_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
34
35
  int y;

36
37
  dst += x0 + y0 * dstride;
  src += x0 + y0 * sstride;
38

Steinar Midtskogen's avatar
Steinar Midtskogen committed
39
40
41
  for (y = 0; y < sizey; y += 2) {
    const v64 l1 = v64_load_aligned(src);
    const v64 l2 = v64_load_aligned(src + sstride);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
42
43
    const v64 l3 = v64_load_aligned(src - (y != top) * sstride);
    const v64 l4 = v64_load_aligned(src + ((y != bottom) + 1) * sstride);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
44
45
    v128 o = v128_from_v64(l1, l2);
    const v128 a =
Steinar Midtskogen's avatar
Steinar Midtskogen committed
46
47
48
49
50
51
        v128_from_v64(v64_load_aligned(src - 2 * (y != top) * sstride), l3);
    const v128 b = v128_from_v64(l3, l1);
    const v128 g = v128_from_v64(l2, l4);
    const v128 h = v128_from_v64(
        l4, v64_load_aligned(src + (2 * (y != bottom) + 1) * sstride));
    v128 c, d, e, f;
52

53
    if (left) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
54
      c = v128_from_v64(v64_load_unaligned(src - 2),
55
                        v64_load_unaligned(src - 2 + sstride));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
56
      d = v128_from_v64(v64_load_unaligned(src - 1),
57
58
59
                        v64_load_unaligned(src - 1 + sstride));
    } else {  // Left clipping
      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
60
      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
61
    }
62
    if (right) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
63
      e = v128_from_v64(v64_load_unaligned(src + 1),
64
                        v64_load_unaligned(src + 1 + sstride));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
65
      f = v128_from_v64(v64_load_unaligned(src + 2),
66
67
68
                        v64_load_unaligned(src + 2 + sstride));
    } else {  // Right clipping
      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
69
      f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
70
    }
71

72
    o = calc_delta(o, a, b, c, d, e, f, g, h, strength, dmp);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
73
74
75
76
77
78
79
    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}

80
81
82
// As above, but with no clipping tests
static void clpf_block8_noclip(const uint8_t *src, uint8_t *dst, int sstride,
                               int dstride, int x0, int y0, int sizey,
83
                               unsigned int strength, unsigned int dmp) {
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
  int y;

  dst += x0 + y0 * dstride;
  src += x0 + y0 * sstride;

  for (y = 0; y < sizey; y += 2) {
    const v64 l1 = v64_load_aligned(src);
    const v64 l2 = v64_load_aligned(src + sstride);
    const v64 l3 = v64_load_aligned(src - sstride);
    const v64 l4 = v64_load_aligned(src + 2 * sstride);
    const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3);
    const v128 b = v128_from_v64(l3, l1);
    const v128 g = v128_from_v64(l2, l4);
    const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride));
    const v128 c = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 e = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));
106
107
    const v128 o = calc_delta(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h,
                              strength, dmp);
108
109
110
111
112
113
114
115

    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}

Steinar Midtskogen's avatar
Steinar Midtskogen committed
116
117
// Process blocks of width 4, four lines at a time, 8 bit.
static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
118
                        int dstride, int x0, int y0, int sizey,
119
120
                        BOUNDARY_TYPE bt, unsigned int strength,
                        unsigned int dmp) {
121
122
123
  const int right = !(bt & TILE_RIGHT_BOUNDARY);
  const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 4 : -1;
  const int left = !(bt & TILE_LEFT_BOUNDARY);
124
  const int top = bt & TILE_ABOVE_BOUNDARY ? 0 : -1;
125

Steinar Midtskogen's avatar
Steinar Midtskogen committed
126
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
127
                  c_shuff[]) = { 0x0504040401000000LL, 0x0d0c0c0c09080808LL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
128
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
129
                  d_shuff[]) = { 0x0605040402010000LL, 0x0e0d0c0c0a090808LL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
130
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
131
                  e_shuff[]) = { 0x0707060503030201LL, 0x0f0f0e0d0b0b0a09LL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
132
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
133
                  f_shuff[]) = { 0x0707070603030302LL, 0x0f0f0f0e0b0b0b0aLL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
134
135
136
137
138
139
  int y;

  dst += x0 + y0 * dstride;
  src += x0 + y0 * sstride;

  for (y = 0; y < sizey; y += 4) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
140
141
142
143
144
145
146
147
148
149
    const uint32_t l0 = u32_load_aligned(src - 2 * (y != top) * sstride);
    const uint32_t l1 = u32_load_aligned(src - (y != top) * sstride);
    const uint32_t l2 = u32_load_aligned(src);
    const uint32_t l3 = u32_load_aligned(src + sstride);
    const uint32_t l4 = u32_load_aligned(src + 2 * sstride);
    const uint32_t l5 = u32_load_aligned(src + 3 * sstride);
    const uint32_t l6 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
    const uint32_t l7 =
        u32_load_aligned(src + (2 * (y != bottom) + 3) * sstride);
    v128 o = v128_from_32(l2, l3, l4, l5);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
150
    const v128 a = v128_from_32(l0, l1, l2, l3);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
151
152
153
154
    const v128 b = v128_from_32(l1, l2, l3, l4);
    const v128 g = v128_from_32(l3, l4, l5, l6);
    const v128 h = v128_from_32(l4, l5, l6, l7);
    v128 c, d, e, f;
Steinar Midtskogen's avatar
Steinar Midtskogen committed
155

156
    if (left) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
157
      c = v128_from_32(u32_load_unaligned(src - 2),
158
159
160
                       u32_load_unaligned(src + sstride - 2),
                       u32_load_unaligned(src + 2 * sstride - 2),
                       u32_load_unaligned(src + 3 * sstride - 2));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
161
      d = v128_from_32(u32_load_unaligned(src - 1),
162
163
164
165
166
                       u32_load_unaligned(src + sstride - 1),
                       u32_load_unaligned(src + 2 * sstride - 1),
                       u32_load_unaligned(src + 3 * sstride - 1));
    } else {  // Left clipping
      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
167
      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
168
    }
169
    if (right) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
170
      e = v128_from_32(u32_load_unaligned(src + 1),
171
172
173
                       u32_load_unaligned(src + sstride + 1),
                       u32_load_unaligned(src + 2 * sstride + 1),
                       u32_load_unaligned(src + 3 * sstride + 1));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
174
      f = v128_from_32(u32_load_unaligned(src + 2),
175
176
177
178
179
                       u32_load_unaligned(src + sstride + 2),
                       u32_load_unaligned(src + 2 * sstride + 2),
                       u32_load_unaligned(src + 3 * sstride + 2));
    } else {  // Right clipping
      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
180
      f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
181
    }
Steinar Midtskogen's avatar
Steinar Midtskogen committed
182

183
    o = calc_delta(o, a, b, c, d, e, f, g, h, strength, dmp);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
184
185
186
187
188
189
190
    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));

    dst += 4 * dstride;
    src += 4 * sstride;
191
192
193
  }
}

194
195
196
// As above, but with no clipping tests
static void clpf_block4_noclip(const uint8_t *src, uint8_t *dst, int sstride,
                               int dstride, int x0, int y0, int sizey,
197
                               unsigned int strength, unsigned int dmp) {
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
  int y;

  dst += x0 + y0 * dstride;
  src += x0 + y0 * sstride;

  for (y = 0; y < sizey; y += 4) {
    const uint32_t l0 = u32_load_aligned(src - 2 * sstride);
    const uint32_t l1 = u32_load_aligned(src - sstride);
    const uint32_t l2 = u32_load_aligned(src);
    const uint32_t l3 = u32_load_aligned(src + sstride);
    const uint32_t l4 = u32_load_aligned(src + 2 * sstride);
    const uint32_t l5 = u32_load_aligned(src + 3 * sstride);
    const uint32_t l6 = u32_load_aligned(src + 4 * sstride);
    const uint32_t l7 = u32_load_aligned(src + 5 * sstride);
    const v128 a = v128_from_32(l0, l1, l2, l3);
    const v128 b = v128_from_32(l1, l2, l3, l4);
    const v128 g = v128_from_32(l3, l4, l5, l6);
    const v128 h = v128_from_32(l4, l5, l6, l7);
    const v128 c = v128_from_32(u32_load_unaligned(src - 2),
                                u32_load_unaligned(src + sstride - 2),
                                u32_load_unaligned(src + 2 * sstride - 2),
                                u32_load_unaligned(src + 3 * sstride - 2));
    const v128 d = v128_from_32(u32_load_unaligned(src - 1),
                                u32_load_unaligned(src + sstride - 1),
                                u32_load_unaligned(src + 2 * sstride - 1),
                                u32_load_unaligned(src + 3 * sstride - 1));
    const v128 e = v128_from_32(u32_load_unaligned(src + 1),
                                u32_load_unaligned(src + sstride + 1),
                                u32_load_unaligned(src + 2 * sstride + 1),
                                u32_load_unaligned(src + 3 * sstride + 1));
    const v128 f = v128_from_32(u32_load_unaligned(src + 2),
                                u32_load_unaligned(src + sstride + 2),
                                u32_load_unaligned(src + 2 * sstride + 2),
                                u32_load_unaligned(src + 3 * sstride + 2));

    const v128 o = calc_delta(v128_from_32(l2, l3, l4, l5), a, b, c, d, e, f, g,
234
                              h, strength, dmp);
235
236
237
238
239
240
241
242
243
244
245

    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));

    dst += 4 * dstride;
    src += 4 * sstride;
  }
}

246
247
void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
                               int dstride, int x0, int y0, int sizex,
248
                               int sizey, unsigned int strength,
249
                               BOUNDARY_TYPE bt, unsigned int dmp) {
250
251
252
253
  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block widths not 4 or 8
    // * block heights not a multiple of 4 if the block width is 4
254
    aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, strength,
255
                     bt, dmp);
256
  } else {
257
258
    if (bt)
      (sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0,
259
                                               y0, sizey, bt, strength, dmp);
260
261
    else
      (sizex == 4 ? clpf_block4_noclip : clpf_block8_noclip)(
262
          src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
263
264
  }
}
265

266
#if defined(CONFIG_AOM_HIGHBITDEPTH)
Steinar Midtskogen's avatar
Steinar Midtskogen committed
267
// sign(a - b) * max(0, abs(a - b) - max(0, abs(a - b) -
268
// strength + (abs(a - b) >> (dmp - log2(s)))))
Steinar Midtskogen's avatar
Steinar Midtskogen committed
269
SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength,
270
                               unsigned int dmp) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
271
272
273
274
275
  const v128 diff = v128_sub_16(v128_max_s16(a, b), v128_min_s16(a, b));
  const v128 sign = v128_cmpeq_16(v128_min_s16(a, b), a);  // -(a <= b)
  const v128 zero = v128_zero();
  const v128 s = v128_max_s16(
      zero, v128_sub_16(v128_dup_16(strength),
276
                        v128_shr_u16(diff, dmp - get_msb(strength))));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
277
278
279
280
281
282
283
284
  return v128_sub_16(
      v128_xor(sign,
               v128_max_s16(
                   zero, v128_sub_16(
                             diff, v128_max_s16(zero, v128_sub_16(diff, s))))),
      sign);
}

285
286
287
288
// delta = 1/16 * constrain(a, x, s, dmp) + 3/16 * constrain(b, x, s, dmp) +
//         1/16 * constrain(c, x, s, dmp) + 3/16 * constrain(d, x, s, dmp) +
//         3/16 * constrain(e, x, s, dmp) + 1/16 * constrain(f, x, s, dmp) +
//         3/16 * constrain(g, x, s, dmp) + 1/16 * constrain(h, x, s, dmp)
Steinar Midtskogen's avatar
Steinar Midtskogen committed
289
290
SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
                                v128 f, v128 g, v128 h, unsigned int s,
291
                                unsigned int dmp) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
292
  const v128 bdeg = v128_add_16(
293
294
      v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(d, x, s, dmp)),
      v128_add_16(constrain_hbd(e, x, s, dmp), constrain_hbd(g, x, s, dmp)));
295
296
  const v128 delta = v128_add_16(
      v128_add_16(
297
298
299
          v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(c, x, s, dmp)),
          v128_add_16(constrain_hbd(f, x, s, dmp),
                      constrain_hbd(h, x, s, dmp))),
Steinar Midtskogen's avatar
Steinar Midtskogen committed
300
      v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
301
  return v128_add_16(
Steinar Midtskogen's avatar
Steinar Midtskogen committed
302
303
304
305
306
      x,
      v128_shr_s16(
          v128_add_16(v128_dup_16(8),
                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
          4));
307
308
}

Steinar Midtskogen's avatar
Steinar Midtskogen committed
309
// Filters two rows of four 16-bit pixels held in o and stores them to dst:
// the high 64 bits go to the row at dst, the low 64 bits to the row at
// dst + dstride.
static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, uint16_t *dst,
                            unsigned int s, unsigned int dmp, int dstride) {
  const v128 filtered = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);
  uint16_t *const second_row = dst + dstride;

  v64_store_aligned(dst, v128_high_v64(filtered));
  v64_store_aligned(second_row, v128_low_v64(filtered));
}

// Filters one row of eight 16-bit pixels held in o and stores the result as a
// single 128-bit vector at dst.
static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, uint16_t *dst,
                            unsigned int s, unsigned int dmp) {
  const v128 filtered = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);

  v128_store_aligned(dst, filtered);
}

// Process blocks of width 4, two lines at time.
SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
                                 int sstride, int dstride, int x0, int y0,
326
                                 int sizey, unsigned int strength,
327
                                 BOUNDARY_TYPE bt, unsigned int dmp) {
328
329
330
  const int right = !(bt & TILE_RIGHT_BOUNDARY);
  const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1;
  const int left = !(bt & TILE_LEFT_BOUNDARY);
331
  const int top = bt & TILE_ABOVE_BOUNDARY ? 0 : -1;
332

Steinar Midtskogen's avatar
Steinar Midtskogen committed
333
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
334
                  c_shuff[]) = { 0x0302010001000100LL, 0x0b0a090809080908LL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
335
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
336
                  d_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080908LL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
337
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
338
                  e_shuff[]) = { 0x0706070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
339
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
340
                  f_shuff[]) = { 0x0706070607060504LL, 0x0f0e0f0e0f0e0d0cLL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
341
342
343
344
345
346
347
348
  int y;

  dst += x0 + y0 * dstride;
  src += x0 + y0 * sstride;

  for (y = 0; y < sizey; y += 2) {
    const v64 l1 = v64_load_aligned(src);
    const v64 l2 = v64_load_aligned(src + sstride);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
349
350
    const v64 l3 = v64_load_aligned(src - (y != top) * sstride);
    const v64 l4 = v64_load_aligned(src + ((y != bottom) + 1) * sstride);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
351
352
    v128 o = v128_from_v64(l1, l2);
    const v128 a =
Steinar Midtskogen's avatar
Steinar Midtskogen committed
353
354
355
356
357
358
        v128_from_v64(v64_load_aligned(src - 2 * (y != top) * sstride), l3);
    const v128 b = v128_from_v64(l3, l1);
    const v128 g = v128_from_v64(l2, l4);
    const v128 h = v128_from_v64(
        l4, v64_load_aligned(src + (2 * (y != bottom) + 1) * sstride));
    v128 c, d, e, f;
Steinar Midtskogen's avatar
Steinar Midtskogen committed
359

360
    if (left) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
361
      c = v128_from_v64(v64_load_unaligned(src - 2),
362
                        v64_load_unaligned(src - 2 + sstride));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
363
      d = v128_from_v64(v64_load_unaligned(src - 1),
364
365
366
                        v64_load_unaligned(src - 1 + sstride));
    } else {  // Left clipping
      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
367
      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
368
    }
369
    if (right) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
370
      e = v128_from_v64(v64_load_unaligned(src + 1),
371
                        v64_load_unaligned(src + 1 + sstride));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
372
      f = v128_from_v64(v64_load_unaligned(src + 2),
373
374
375
                        v64_load_unaligned(src + 2 + sstride));
    } else {  // Right clipping
      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
376
      f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
377
    }
378
    calc_delta_hbd4(o, a, b, c, d, e, f, g, h, dst, strength, dmp, dstride);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
379
380
381
382
383
    src += sstride * 2;
    dst += dstride * 2;
  }
}

384
385
386
387
388
// As above, but with no clipping tests
SIMD_INLINE void clpf_block_hbd4_noclip(const uint16_t *src, uint16_t *dst,
                                        int sstride, int dstride, int x0,
                                        int y0, int sizey,
                                        unsigned int strength,
389
                                        unsigned int dmp) {
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
  int y;

  dst += x0 + y0 * dstride;
  src += x0 + y0 * sstride;

  for (y = 0; y < sizey; y += 2) {
    const v64 l1 = v64_load_aligned(src);
    const v64 l2 = v64_load_aligned(src + sstride);
    const v64 l3 = v64_load_aligned(src - sstride);
    const v64 l4 = v64_load_aligned(src + 2 * sstride);
    const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3);
    const v128 b = v128_from_v64(l3, l1);
    const v128 g = v128_from_v64(l2, l4);
    const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride));
    const v128 c = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 e = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));

    calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst,
414
                    strength, dmp, dstride);
415
416
417
418
419
    src += sstride * 2;
    dst += dstride * 2;
  }
}

Steinar Midtskogen's avatar
Steinar Midtskogen committed
420
// The most simple case.  Start here if you need to understand the functions.
421
422
SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
                                int dstride, int x0, int y0, int sizey,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
423
                                unsigned int strength, BOUNDARY_TYPE bt,
424
                                unsigned int dmp) {
425
426
  const int right = !(bt & TILE_RIGHT_BOUNDARY);
  const int left = !(bt & TILE_LEFT_BOUNDARY);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
427
428
  const int ymin = -!(bt & TILE_ABOVE_BOUNDARY) * 2;
  const int ymax = sizey + !(bt & TILE_BOTTOM_BOUNDARY) * 2 - 1;
429

Steinar Midtskogen's avatar
Steinar Midtskogen committed
430
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
431
                  c_shuff[]) = { 0x0302010001000100LL, 0x0b0a090807060504LL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
432
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
433
                  d_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080706LL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
434
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
435
                  e_shuff[]) = { 0x0908070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
436
  DECLARE_ALIGNED(16, static const uint64_t,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
437
                  f_shuff[]) = { 0x0b0a090807060504LL, 0x0f0e0f0e0f0e0d0cLL };
Steinar Midtskogen's avatar
Steinar Midtskogen committed
438
  int y;
439
440
441
442

  dst += x0 + y0 * dstride;
  src += x0 + y0 * sstride;

Steinar Midtskogen's avatar
Steinar Midtskogen committed
443
444
445
446
447
  // Read 8 set of pixels at a time.  Clipping along upper and lower
  // edges is handled by reading the upper or lower line twice.
  // Clipping along the left and right edges is handled by shuffle
  // instructions doing shift and pad.
  for (y = 0; y < sizey; y++) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
448
449
450
451
452
453
    const v128 o = v128_load_aligned(src + y * sstride);
    const v128 a = v128_load_aligned(src + AOMMAX(ymin, y - 2) * sstride);
    const v128 b = v128_load_aligned(src + AOMMAX(ymin, y - 1) * sstride);
    const v128 g = v128_load_aligned(src + AOMMIN(ymax, y + 1) * sstride);
    const v128 h = v128_load_aligned(src + AOMMIN(ymax, y + 2) * sstride);
    v128 c, d, e, f;
Steinar Midtskogen's avatar
Steinar Midtskogen committed
454

455
    if (left) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
456
457
      c = v128_load_unaligned(src + y * sstride - 2);
      d = v128_load_unaligned(src + y * sstride - 1);
458
459
    } else {  // Left clipping
      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
460
      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
461
    }
462
    if (right) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
463
464
      e = v128_load_unaligned(src + y * sstride + 1);
      f = v128_load_unaligned(src + y * sstride + 2);
465
466
    } else {  // Right clipping
      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
467
      f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
468
    }
469
    calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, dmp);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
470
    dst += dstride;
471
472
473
  }
}

474
475
476
477
// As above, but with no clipping tests
SIMD_INLINE void clpf_block_hbd_noclip(const uint16_t *src, uint16_t *dst,
                                       int sstride, int dstride, int x0, int y0,
                                       int sizey, unsigned int strength,
478
                                       unsigned int dmp) {
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
  int y;

  dst += x0 + y0 * dstride;
  src += x0 + y0 * sstride;

  for (y = 0; y < sizey; y++) {
    const v128 o = v128_load_aligned(src);
    const v128 a = v128_load_aligned(src - 2 * sstride);
    const v128 b = v128_load_aligned(src - 1 * sstride);
    const v128 g = v128_load_aligned(src + sstride);
    const v128 h = v128_load_aligned(src + 2 * sstride);
    const v128 c = v128_load_unaligned(src - 2);
    const v128 d = v128_load_unaligned(src - 1);
    const v128 e = v128_load_unaligned(src + 1);
    const v128 f = v128_load_unaligned(src + 2);

495
    calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, dmp);
496
497
498
499
500
    src += sstride;
    dst += dstride;
  }
}

501
502
void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
                                   int sstride, int dstride, int x0, int y0,
503
                                   int sizex, int sizey, unsigned int strength,
504
                                   BOUNDARY_TYPE bt, unsigned int dmp) {
505
506
507
508
  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block width not 4 or 8
    // * block heights not a multiple of 2 if the block width is 4
509
    aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
510
                         strength, bt, dmp);
511
  } else {
512
513
    if (bt)
      (sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)(
514
          src, dst, sstride, dstride, x0, y0, sizey, strength, bt, dmp);
515
516
    else
      (sizex == 4 ? clpf_block_hbd4_noclip : clpf_block_hbd_noclip)(
517
          src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
518
519
520
  }
}
#endif