/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./av1_rtcd.h"
#include "aom_ports/mem.h"
#include "aom_ports/bitops.h"

// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
17
SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
18
                           unsigned int adjdamp) {
19
20
  const v256 diff16 = v256_sub_16(a, b);
  v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
21
  const v128 sign = v128_cmplt_s8(diff, v128_zero());
22
23
24
25
26
27
  diff = v128_abs_s8(diff);
  return v128_xor(
      v128_add_8(sign,
                 v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
                                                v128_shr_u8(diff, adjdamp)))),
      sign);
28
29
}

30
31
32
33
// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
34
35
SIMD_INLINE v128 calc_delta(v256 x, v256 a, v256 b, v256 c, v256 d, v256 e,
                            v256 f, v256 g, v256 h, unsigned int s,
36
37
38
39
40
41
42
43
44
                            unsigned int dmp) {
  const v128 bdeg =
      v128_add_8(v128_add_8(constrain(b, x, s, dmp), constrain(d, x, s, dmp)),
                 v128_add_8(constrain(e, x, s, dmp), constrain(g, x, s, dmp)));
  const v128 delta = v128_add_8(
      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(c, x, s, dmp)),
                 v128_add_8(constrain(f, x, s, dmp), constrain(h, x, s, dmp))),
      v128_add_8(v128_add_8(bdeg, bdeg), bdeg));
  return v128_add_8(
45
46
47
48
49
      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
      v128_shr_s8(
          v128_add_8(v128_dup_8(8),
                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
          4));
50
51
}

52
53
// delta = 1/8 * constrain(a, x, s, d) + 3/8 * constrain(b, x, s, d) +
//         3/8 * constrain(c, x, s, d) + 1/8 * constrain(d, x, s, d) +
54
SIMD_INLINE v128 calc_hdelta(v256 x, v256 a, v256 b, v256 c, v256 d,
55
56
57
58
59
60
                             unsigned int s, unsigned int dmp) {
  const v128 bc = v128_add_8(constrain(b, x, s, dmp), constrain(c, x, s, dmp));
  const v128 delta =
      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(d, x, s, dmp)),
                 v128_add_8(v128_add_8(bc, bc), bc));
  return v128_add_8(
61
62
63
64
65
      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
      v128_shr_s8(
          v128_add_8(v128_dup_8(4),
                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
          3));
66
}
67

Steinar Midtskogen's avatar
Steinar Midtskogen committed
68
// Process blocks of width 8, two lines at a time, 8 bit.
69
70
71
72
static void SIMD_FUNC(clpf_block8)(uint8_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizey,
                                   unsigned int strength,
                                   unsigned int adjdamp) {
73
74
75
  int y;

  for (y = 0; y < sizey; y += 2) {
76
77
78
79
    const v128 l1 = v128_load_aligned(src);
    const v128 l2 = v128_load_aligned(src + sstride);
    const v128 l3 = v128_load_aligned(src - sstride);
    const v128 l4 = v128_load_aligned(src + 2 * sstride);
80
81
82
83
84
85
86
87
88
89
90
91
92
    const v256 a = v256_from_v128(v128_load_aligned(src - 2 * sstride), l3);
    const v256 b = v256_from_v128(l3, l1);
    const v256 g = v256_from_v128(l2, l4);
    const v256 h = v256_from_v128(l4, v128_load_aligned(src + 3 * sstride));
    const v256 c = v256_from_v128(v128_load_unaligned(src - 2),
                                  v128_load_unaligned(src - 2 + sstride));
    const v256 d = v256_from_v128(v128_load_unaligned(src - 1),
                                  v128_load_unaligned(src - 1 + sstride));
    const v256 e = v256_from_v128(v128_load_unaligned(src + 1),
                                  v128_load_unaligned(src + 1 + sstride));
    const v256 f = v256_from_v128(v128_load_unaligned(src + 2),
                                  v128_load_unaligned(src + 2 + sstride));
    const v128 o = calc_delta(v256_from_v128(l1, l2), a, b, c, d, e, f, g, h,
93
                              strength, adjdamp);
94
95
96
97
98
99
100
101

    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}

Steinar Midtskogen's avatar
Steinar Midtskogen committed
102
// Process blocks of width 4, four lines at a time, 8 bit.
103
104
105
106
static void SIMD_FUNC(clpf_block4)(uint8_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizey,
                                   unsigned int strength,
                                   unsigned int adjdamp) {
107
108
  int y;

109
110
111
112
113
114
115
116
117
  for (y = 0; y < sizey; y += 4) {
    const v64 l0 = v64_load_aligned(src - 2 * sstride);
    const v64 l1 = v64_load_aligned(src - sstride);
    const v64 l2 = v64_load_aligned(src);
    const v64 l3 = v64_load_aligned(src + sstride);
    const v64 l4 = v64_load_aligned(src + 2 * sstride);
    const v64 l5 = v64_load_aligned(src + 3 * sstride);
    const v64 l6 = v64_load_aligned(src + 4 * sstride);
    const v64 l7 = v64_load_aligned(src + 5 * sstride);
118
    const v128 o =
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
        calc_delta(v256_from_v64(l2, l3, l4, l5), v256_from_v64(l0, l1, l2, l3),
                   v256_from_v64(l1, l2, l3, l4),
                   v256_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src + sstride - 2),
                                 v64_load_unaligned(src + 2 * sstride - 2),
                                 v64_load_unaligned(src + 3 * sstride - 2)),
                   v256_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src + sstride - 1),
                                 v64_load_unaligned(src + 2 * sstride - 1),
                                 v64_load_unaligned(src + 3 * sstride - 1)),
                   v256_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + sstride + 1),
                                 v64_load_unaligned(src + 2 * sstride + 1),
                                 v64_load_unaligned(src + 3 * sstride + 1)),
                   v256_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + sstride + 2),
                                 v64_load_unaligned(src + 2 * sstride + 2),
                                 v64_load_unaligned(src + 3 * sstride + 2)),
                   v256_from_v64(l3, l4, l5, l6), v256_from_v64(l4, l5, l6, l7),
                   strength, adjdamp);
139
140
141
142
143
144
145
146
147
148
149

    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));

    dst += 4 * dstride;
    src += 4 * sstride;
  }
}

150
151
152
153
static void SIMD_FUNC(clpf_hblock8)(uint8_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizey,
                                    unsigned int strength,
                                    unsigned int adjdamp) {
154
155
156
  int y;

  for (y = 0; y < sizey; y += 2) {
157
158
159
160
161
162
163
164
165
166
167
    const v256 x = v256_from_v128(v128_load_aligned(src),
                                  v128_load_aligned(src + sstride));
    const v256 a = v256_from_v128(v128_load_unaligned(src - 2),
                                  v128_load_unaligned(src - 2 + sstride));
    const v256 b = v256_from_v128(v128_load_unaligned(src - 1),
                                  v128_load_unaligned(src - 1 + sstride));
    const v256 c = v256_from_v128(v128_load_unaligned(src + 1),
                                  v128_load_unaligned(src + 1 + sstride));
    const v256 d = v256_from_v128(v128_load_unaligned(src + 2),
                                  v128_load_unaligned(src + 2 + sstride));
    const v128 o = calc_hdelta(x, a, b, c, d, strength, adjdamp);
168
169
170
171
172
173
174
175
176

    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}

// Process blocks of width 4, four lines at a time, 8 bit.
177
178
179
180
static void SIMD_FUNC(clpf_hblock4)(uint8_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizey,
                                    unsigned int strength,
                                    unsigned int adjdamp) {
181
  int y;
182
183

  for (y = 0; y < sizey; y += 4) {
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
    const v256 a = v256_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src + sstride - 2),
                                 v64_load_unaligned(src + 2 * sstride - 2),
                                 v64_load_unaligned(src + 3 * sstride - 2));
    const v256 b = v256_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src + sstride - 1),
                                 v64_load_unaligned(src + 2 * sstride - 1),
                                 v64_load_unaligned(src + 3 * sstride - 1));
    const v256 c = v256_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + sstride + 1),
                                 v64_load_unaligned(src + 2 * sstride + 1),
                                 v64_load_unaligned(src + 3 * sstride + 1));
    const v256 d = v256_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + sstride + 2),
                                 v64_load_unaligned(src + 2 * sstride + 2),
                                 v64_load_unaligned(src + 3 * sstride + 2));
200
201

    const v128 o = calc_hdelta(
202
203
204
205
        v256_from_v64(v64_load_aligned(src), v64_load_aligned(src + sstride),
                      v64_load_aligned(src + 2 * sstride),
                      v64_load_aligned(src + 3 * sstride)),
        a, b, c, d, strength, adjdamp);
206
207
208
209
210
211
212
213
214
215
216

    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));

    dst += 4 * dstride;
    src += 4 * sstride;
  }
}

217
218
219
void SIMD_FUNC(aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride,
                               int sstride, int sizex, int sizey,
                               unsigned int strength, unsigned int dmp) {
220
221
222
223
  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block widths not 4 or 8
    // * block heights not a multiple of 4 if the block width is 4
224
    aom_clpf_block_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
225
  } else {
226
227
    (sizex == 4 ? SIMD_FUNC(clpf_block4) : SIMD_FUNC(clpf_block8))(
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
228
229
230
231
232
233
234
235
236
237
238
239
  }
}

// Public entry point for the horizontal-only CLPF, 8 bit.  Same dispatch
// rules as aom_clpf_block.
void SIMD_FUNC(aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride,
                                int sstride, int sizex, int sizey,
                                unsigned int strength, unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block widths not 4 or 8
    // * block heights not a multiple of 4 if the block width is 4
    aom_clpf_hblock_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
  } else {
    // The kernels take the pre-adjusted damping dmp - log2(strength).
    (sizex == 4 ? SIMD_FUNC(clpf_hblock4) : SIMD_FUNC(clpf_hblock8))(
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
  }
}

245
// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
Steinar Midtskogen's avatar
Steinar Midtskogen committed
246
SIMD_INLINE v128 constrain_hbd(v128 a, v128 b, unsigned int strength,
247
                               unsigned int adjdamp) {
248
249
250
  v128 diff = v128_sub_16(a, b);
  const v128 sign = v128_shr_n_s16(diff, 15);
  diff = v128_abs_s16(diff);
251
252
  const v128 s =
      v128_ssub_u16(v128_dup_16(strength), v128_shr_u16(diff, adjdamp));
253
  return v128_xor(v128_add_16(sign, v128_min_s16(diff, s)), sign);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
254
255
}

256
257
258
259
// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
Steinar Midtskogen's avatar
Steinar Midtskogen committed
260
261
SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
                                v128 f, v128 g, v128 h, unsigned int s,
262
                                unsigned int dmp) {
Steinar Midtskogen's avatar
Steinar Midtskogen committed
263
  const v128 bdeg = v128_add_16(
264
265
      v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(d, x, s, dmp)),
      v128_add_16(constrain_hbd(e, x, s, dmp), constrain_hbd(g, x, s, dmp)));
266
267
  const v128 delta = v128_add_16(
      v128_add_16(
268
269
270
          v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(c, x, s, dmp)),
          v128_add_16(constrain_hbd(f, x, s, dmp),
                      constrain_hbd(h, x, s, dmp))),
Steinar Midtskogen's avatar
Steinar Midtskogen committed
271
      v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
Steinar Midtskogen's avatar
Steinar Midtskogen committed
272
  return v128_add_16(
Steinar Midtskogen's avatar
Steinar Midtskogen committed
273
274
275
276
277
      x,
      v128_shr_s16(
          v128_add_16(v128_dup_16(8),
                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
          4));
278
279
}

Steinar Midtskogen's avatar
Steinar Midtskogen committed
280
static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
Steinar Midtskogen's avatar
Steinar Midtskogen committed
281
                            v128 f, v128 g, v128 h, uint16_t *dst,
282
283
                            unsigned int s, unsigned int dmp, int dstride) {
  o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);
Steinar Midtskogen's avatar
Steinar Midtskogen committed
284
285
286
287
288
  v64_store_aligned(dst, v128_high_v64(o));
  v64_store_aligned(dst + dstride, v128_low_v64(o));
}

// Compute the high-bitdepth delta for one 8-pixel row and store it to dst.
static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, uint16_t *dst,
                            unsigned int s, unsigned int adjdamp) {
  v128_store_aligned(dst,
                     calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, adjdamp));
}

295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
// delta = 1/16 * constrain(a, x, s, dmp) + 3/16 * constrain(b, x, s, dmp) +
//         3/16 * constrain(c, x, s, dmp) + 1/16 * constrain(d, x, s, dmp)
SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
                                 unsigned int s, unsigned int dmp) {
  const v128 bc =
      v128_add_16(constrain_hbd(b, x, s, dmp), constrain_hbd(c, x, s, dmp));
  const v128 delta = v128_add_16(
      v128_add_16(constrain_hbd(a, x, s, dmp), constrain_hbd(d, x, s, dmp)),
      v128_add_16(v128_add_16(bc, bc), bc));
  return v128_add_16(
      x,
      v128_shr_s16(
          v128_add_16(v128_dup_16(4),
                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
          3));
}

static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d,
313
314
315
                             uint16_t *dst, unsigned int s,
                             unsigned int adjdamp, int dstride) {
  o = calc_hdelta_hbd(o, a, b, c, d, s, adjdamp);
316
317
318
319
320
  v64_store_aligned(dst, v128_high_v64(o));
  v64_store_aligned(dst + dstride, v128_low_v64(o));
}

static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d,
321
322
323
                             uint16_t *dst, unsigned int s,
                             unsigned int adjdamp) {
  v128_store_aligned(dst, calc_hdelta_hbd(o, a, b, c, d, s, adjdamp));
324
325
}

Steinar Midtskogen's avatar
Steinar Midtskogen committed
326
// Process blocks of width 4, two lines at time.
327
328
329
330
static void SIMD_FUNC(clpf_block_hbd4)(uint16_t *dst, const uint16_t *src,
                                       int dstride, int sstride, int sizey,
                                       unsigned int strength,
                                       unsigned int adjdamp) {
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
  int y;

  for (y = 0; y < sizey; y += 2) {
    const v64 l1 = v64_load_aligned(src);
    const v64 l2 = v64_load_aligned(src + sstride);
    const v64 l3 = v64_load_aligned(src - sstride);
    const v64 l4 = v64_load_aligned(src + 2 * sstride);
    const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3);
    const v128 b = v128_from_v64(l3, l1);
    const v128 g = v128_from_v64(l2, l4);
    const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride));
    const v128 c = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 e = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));

    calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst,
352
                    strength, adjdamp, dstride);
353
354
355
356
357
    src += sstride * 2;
    dst += dstride * 2;
  }
}

Steinar Midtskogen's avatar
Steinar Midtskogen committed
358
// The most simple case.  Start here if you need to understand the functions.
359
360
361
362
static void SIMD_FUNC(clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
                                      int dstride, int sstride, int sizey,
                                      unsigned int strength,
                                      unsigned int adjdamp) {
363
364
365
366
367
368
369
370
371
372
373
374
375
  int y;

  for (y = 0; y < sizey; y++) {
    const v128 o = v128_load_aligned(src);
    const v128 a = v128_load_aligned(src - 2 * sstride);
    const v128 b = v128_load_aligned(src - 1 * sstride);
    const v128 g = v128_load_aligned(src + sstride);
    const v128 h = v128_load_aligned(src + 2 * sstride);
    const v128 c = v128_load_unaligned(src - 2);
    const v128 d = v128_load_unaligned(src - 1);
    const v128 e = v128_load_unaligned(src + 1);
    const v128 f = v128_load_unaligned(src + 2);

376
    calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, adjdamp);
377
378
379
380
381
    src += sstride;
    dst += dstride;
  }
}

382
// Process blocks of width 4, horizontal filter, two lines at time.
383
384
385
386
static void SIMD_FUNC(clpf_hblock_hbd4)(uint16_t *dst, const uint16_t *src,
                                        int dstride, int sstride, int sizey,
                                        unsigned int strength,
                                        unsigned int adjdamp) {
387
388
389
390
391
392
393
394
395
396
397
398
399
400
  int y;

  for (y = 0; y < sizey; y += 2) {
    const v128 a = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 b = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 c = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));

    calc_hdelta_hbd4(v128_from_v64(v64_load_unaligned(src),
                                   v64_load_unaligned(src + sstride)),
401
                     a, b, c, d, dst, strength, adjdamp, dstride);
402
403
404
405
406
407
    src += sstride * 2;
    dst += dstride * 2;
  }
}

// Process blocks of width 8, horizontal filter, two lines at time.
408
409
410
411
static void SIMD_FUNC(clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
                                       int dstride, int sstride, int sizey,
                                       unsigned int strength,
                                       unsigned int adjdamp) {
412
413
414
415
416
417
418
419
420
  int y;

  for (y = 0; y < sizey; y++) {
    const v128 o = v128_load_aligned(src);
    const v128 a = v128_load_unaligned(src - 2);
    const v128 b = v128_load_unaligned(src - 1);
    const v128 c = v128_load_unaligned(src + 1);
    const v128 d = v128_load_unaligned(src + 2);

421
    calc_hdelta_hbd8(o, a, b, c, d, dst, strength, adjdamp);
422
423
424
425
426
    src += sstride;
    dst += dstride;
  }
}

427
428
429
void SIMD_FUNC(aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizex,
                                   int sizey, unsigned int strength,
430
                                   unsigned int dmp) {
431
432
433
434
  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block width not 4 or 8
    // * block heights not a multiple of 2 if the block width is 4
435
436
    aom_clpf_block_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
                         dmp);
437
  } else {
438
    (sizex == 4 ? SIMD_FUNC(clpf_block_hbd4) : SIMD_FUNC(clpf_block_hbd))(
439
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
440
441
  }
}
442

443
444
445
void SIMD_FUNC(aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizex,
                                    int sizey, unsigned int strength,
446
447
448
449
450
                                    unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
    // Fallback to C for odd sizes:
    // * block width not 4 or 8
    // * block heights not a multiple of 2 if the block width is 4
451
452
    aom_clpf_hblock_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
                          dmp);
453
  } else {
454
    (sizex == 4 ? SIMD_FUNC(clpf_hblock_hbd4) : SIMD_FUNC(clpf_hblock_hbd))(
455
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
456
457
  }
}