intrapred_neon.c 28.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

13
#include "./vpx_config.h"
14
#include "./vpx_dsp_rtcd.h"
Yaowu Xu's avatar
Yaowu Xu committed
15
#include "aom/vpx_integer.h"
16

17 18 19 20
//------------------------------------------------------------------------------
// DC 4x4

// 'do_above' and 'do_left' facilitate branch removal when inlined.
clang-format's avatar
clang-format committed
21 22
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
23 24 25 26 27 28 29 30 31 32 33 34
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    sum_top = vcombine_u16(p1, p1);
  }

  if (do_left) {
clang-format's avatar
clang-format committed
35
    const uint8x8_t L = vld1_u8(left);   // left border
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    sum_left = vcombine_u16(p1, p1);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 3);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 2);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 2);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 4; ++i) {
clang-format's avatar
clang-format committed
56
      vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
57 58 59 60
    }
  }
}

61
void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
62 63 64 65
                               const uint8_t *above, const uint8_t *left) {
  dc_4x4(dst, stride, above, left, 1, 1);
}

66
void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
67 68 69 70 71
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_4x4(dst, stride, NULL, left, 0, 1);
}

72
void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
73 74 75 76 77
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_4x4(dst, stride, above, NULL, 1, 0);
}

78
void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
79 80 81 82 83 84
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_4x4(dst, stride, NULL, NULL, 0, 0);
}

85 86 87 88
//------------------------------------------------------------------------------
// DC 8x8

// 'do_above' and 'do_left' facilitate branch removal when inlined.
clang-format's avatar
clang-format committed
89 90
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
91 92 93 94 95 96 97 98 99 100 101 102 103
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
clang-format's avatar
clang-format committed
104
    const uint8x8_t L = vld1_u8(left);   // left border
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_left = vcombine_u16(p2, p2);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
clang-format's avatar
clang-format committed
126
      vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
127 128 129 130
    }
  }
}

131
void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
132 133 134 135
                               const uint8_t *above, const uint8_t *left) {
  dc_8x8(dst, stride, above, left, 1, 1);
}

136
void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
137 138 139 140 141
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_8x8(dst, stride, NULL, left, 0, 1);
}

142
void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
143 144 145 146 147
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_8x8(dst, stride, above, NULL, 1, 0);
}

148
void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
149 150 151 152 153 154
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_8x8(dst, stride, NULL, NULL, 0, 0);
}

155 156 157 158 159 160 161 162 163 164 165 166 167
//------------------------------------------------------------------------------
// DC 16x16

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A = vld1q_u8(above);  // top row
clang-format's avatar
clang-format committed
168
    const uint16x8_t p0 = vpaddlq_u8(A);   // cascading summation of the top
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    const uint8x16_t L = vld1q_u8(left);  // left row
    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_left = vcombine_u16(p3, p3);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * stride, dc);
    }
  }
}

204
void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
205 206 207 208
                                 const uint8_t *above, const uint8_t *left) {
  dc_16x16(dst, stride, above, left, 1, 1);
}

209
void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
210 211 212 213 214 215
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_16x16(dst, stride, NULL, left, 0, 1);
}

216
void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
217 218 219 220 221 222
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_16x16(dst, stride, above, NULL, 1, 0);
}

223
void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
224 225 226 227 228 229 230
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  dc_16x16(dst, stride, NULL, NULL, 0, 0);
}

231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
//------------------------------------------------------------------------------
// DC 32x32

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A0 = vld1q_u8(above);  // top row
    const uint8x16_t A1 = vld1q_u8(above + 16);
    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
    const uint16x8_t p1 = vpaddlq_u8(A1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_top = vcombine_u16(p5, p5);
  }

  if (do_left) {
    const uint8x16_t L0 = vld1q_u8(left);  // left row
    const uint8x16_t L1 = vld1q_u8(left + 16);
    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
    const uint16x8_t p1 = vpaddlq_u8(L1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_left = vcombine_u16(p5, p5);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 6);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 5);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 5);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 32; ++i) {
      vst1q_u8(dst + i * stride, dc);
      vst1q_u8(dst + i * stride + 16, dc);
    }
  }
}

287
void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
288 289 290 291
                                 const uint8_t *above, const uint8_t *left) {
  dc_32x32(dst, stride, above, left, 1, 1);
}

292
void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
293 294 295 296 297 298
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_32x32(dst, stride, NULL, left, 0, 1);
}

299
void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
300 301 302 303 304 305
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_32x32(dst, stride, above, NULL, 1, 0);
}

306
void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
307 308 309 310 311 312 313
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  dc_32x32(dst, stride, NULL, NULL, 0, 0);
}

James Zern's avatar
James Zern committed
314 315
// -----------------------------------------------------------------------------

316
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
James Zern's avatar
James Zern committed
317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
                                const uint8_t *above, const uint8_t *left) {
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  (void)left;
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
  dst[3 * stride + 3] = above[7];
}

339
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
James Zern's avatar
James Zern committed
340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
                                const uint8_t *above, const uint8_t *left) {
  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
  const uint8x8_t A0 = vld1_u8(above);  // top row
  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
  const uint8x8_t avg1 = vhadd_u8(A0, A2);
  uint8x8_t row = vrhadd_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 7; ++i) {
    vst1_u8(dst + i * stride, row);
    row = vtbl1_u8(row, sh_12345677);
  }
  vst1_u8(dst + i * stride, row);
357 358
}

359
void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
360 361 362 363 364 365 366 367 368 369 370 371 372 373
                                  const uint8_t *above, const uint8_t *left) {
  const uint8x16_t A0 = vld1q_u8(above);  // top row
  const uint8x16_t above_right = vld1q_dup_u8(above + 15);
  const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
  const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
  const uint8x16_t avg1 = vhaddq_u8(A0, A2);
  uint8x16_t row = vrhaddq_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 15; ++i) {
    vst1q_u8(dst + i * stride, row);
    row = vextq_u8(row, above_right, 1);
  }
  vst1q_u8(dst + i * stride, row);
James Zern's avatar
James Zern committed
374 375
}

James Zern's avatar
James Zern committed
376 377
// -----------------------------------------------------------------------------

378
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
James Zern's avatar
James Zern committed
379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32x2_t zero = vdup_n_u32(0);
  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
}

406 407
#if !HAVE_NEON_ASM

408
void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
409 410 411 412
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint32x2_t d0u32 = vdup_n_u32(0);
  (void)left;
413

414
  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
415
  for (i = 0; i < 4; i++, dst += stride)
416
    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
417 418
}

419
void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
420 421 422 423
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x8_t d0u8 = vdup_n_u8(0);
  (void)left;
424

425
  d0u8 = vld1_u8(above);
clang-format's avatar
clang-format committed
426
  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
427 428
}

429
void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
430 431 432 433
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  (void)left;
434

435
  q0u8 = vld1q_u8(above);
clang-format's avatar
clang-format committed
436
  for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
437 438
}

439
void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
440 441 442 443 444
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)left;
445

446 447
  q0u8 = vld1q_u8(above);
  q1u8 = vld1q_u8(above + 16);
448
  for (i = 0; i < 32; i++, dst += stride) {
449 450 451
    vst1q_u8(dst, q0u8);
    vst1q_u8(dst + 16, q1u8);
  }
452 453
}

454
void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
455 456 457 458
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d1u32 = vdup_n_u32(0);
  (void)above;
459

460
  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
461

462 463
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
464
  dst += stride;
465 466
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
467
  dst += stride;
468 469
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
470
  dst += stride;
471 472
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
473 474
}

475
void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
476 477 478 479
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint64x1_t d1u64 = vdup_n_u64(0);
  (void)above;
480

481
  d1u64 = vld1_u64((const uint64_t *)left);
482

483 484
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
  vst1_u8(dst, d0u8);
485
  dst += stride;
486 487
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
  vst1_u8(dst, d0u8);
488
  dst += stride;
489 490
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
  vst1_u8(dst, d0u8);
491
  dst += stride;
492 493
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
  vst1_u8(dst, d0u8);
494
  dst += stride;
495 496
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
  vst1_u8(dst, d0u8);
497
  dst += stride;
498 499
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
  vst1_u8(dst, d0u8);
500
  dst += stride;
501 502
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
  vst1_u8(dst, d0u8);
503
  dst += stride;
504 505 506 507
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
  vst1_u8(dst, d0u8);
}

508
void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
509 510 511 512 513 514 515 516 517 518 519 520
                                const uint8_t *above, const uint8_t *left) {
  int j;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;

  q1u8 = vld1q_u8(left);
  d2u8 = vget_low_u8(q1u8);
  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
    q0u8 = vdupq_lane_u8(d2u8, 0);
    vst1q_u8(dst, q0u8);
521
    dst += stride;
522 523
    q0u8 = vdupq_lane_u8(d2u8, 1);
    vst1q_u8(dst, q0u8);
524
    dst += stride;
525 526
    q0u8 = vdupq_lane_u8(d2u8, 2);
    vst1q_u8(dst, q0u8);
527
    dst += stride;
528 529
    q0u8 = vdupq_lane_u8(d2u8, 3);
    vst1q_u8(dst, q0u8);
530
    dst += stride;
531 532
    q0u8 = vdupq_lane_u8(d2u8, 4);
    vst1q_u8(dst, q0u8);
533
    dst += stride;
534 535
    q0u8 = vdupq_lane_u8(d2u8, 5);
    vst1q_u8(dst, q0u8);
536
    dst += stride;
537 538
    q0u8 = vdupq_lane_u8(d2u8, 6);
    vst1q_u8(dst, q0u8);
539
    dst += stride;
540 541
    q0u8 = vdupq_lane_u8(d2u8, 7);
    vst1q_u8(dst, q0u8);
542
    dst += stride;
543
  }
544 545
}

546
void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
547 548 549 550 551 552
                                const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;
553

554
  for (k = 0; k < 2; k++, left += 16) {
555 556 557
    q1u8 = vld1q_u8(left);
    d2u8 = vget_low_u8(q1u8);
    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
558 559 560
      q0u8 = vdupq_lane_u8(d2u8, 0);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
561
      dst += stride;
562 563 564
      q0u8 = vdupq_lane_u8(d2u8, 1);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
565
      dst += stride;
566 567 568
      q0u8 = vdupq_lane_u8(d2u8, 2);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
569
      dst += stride;
570 571 572
      q0u8 = vdupq_lane_u8(d2u8, 3);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
573
      dst += stride;
574 575 576
      q0u8 = vdupq_lane_u8(d2u8, 4);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
577
      dst += stride;
578 579 580
      q0u8 = vdupq_lane_u8(d2u8, 5);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
581
      dst += stride;
582 583 584
      q0u8 = vdupq_lane_u8(d2u8, 6);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
585
      dst += stride;
586 587 588
      q0u8 = vdupq_lane_u8(d2u8, 7);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
589
      dst += stride;
590
    }
591
  }
592 593
}

594
void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
595 596 597 598 599 600
                               const uint8_t *above, const uint8_t *left) {
  int i;
  uint16x8_t q1u16, q3u16;
  int16x8_t q1s16;
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d2u32 = vdup_n_u32(0);
601

602
  d0u8 = vld1_dup_u8(above - 1);
603 604
  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
605
  for (i = 0; i < 4; i++, dst += stride) {
606
    q1u16 = vdupq_n_u16((uint16_t)left[i]);
clang-format's avatar
clang-format committed
607 608
    q1s16 =
        vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
609 610 611
    d0u8 = vqmovun_s16(q1s16);
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  }
612 613
}

614
void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
615 616 617 618 619 620
                               const uint8_t *above, const uint8_t *left) {
  int j;
  uint16x8_t q0u16, q3u16, q10u16;
  int16x8_t q0s16;
  uint16x4_t d20u16;
  uint8x8_t d0u8, d2u8, d30u8;
621

622
  d0u8 = vld1_dup_u8(above - 1);
623 624 625 626 627 628 629
  d30u8 = vld1_u8(left);
  d2u8 = vld1_u8(above);
  q10u16 = vmovl_u8(d30u8);
  q3u16 = vsubl_u8(d2u8, d0u8);
  d20u16 = vget_low_u16(q10u16);
  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
    q0u16 = vdupq_lane_u16(d20u16, 0);
clang-format's avatar
clang-format committed
630 631
    q0s16 =
        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
632 633
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
634
    dst += stride;
635
    q0u16 = vdupq_lane_u16(d20u16, 1);
clang-format's avatar
clang-format committed
636 637
    q0s16 =
        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
638 639
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
640
    dst += stride;
641
    q0u16 = vdupq_lane_u16(d20u16, 2);
clang-format's avatar
clang-format committed
642 643
    q0s16 =
        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
644 645
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
646
    dst += stride;
647
    q0u16 = vdupq_lane_u16(d20u16, 3);
clang-format's avatar
clang-format committed
648 649
    q0s16 =
        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
650 651
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
652
    dst += stride;
653
  }
654 655
}

656
void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
657 658 659 660 661 662 663
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
  uint8x16_t q0u8, q1u8;
  int16x8_t q0s16, q1s16, q8s16, q11s16;
  uint16x4_t d20u16;
  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
664

665
  q0u8 = vld1q_dup_u8(above - 1);
666 667 668 669 670 671
  q1u8 = vld1q_u8(above);
  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  for (k = 0; k < 2; k++, left += 8) {
    d18u8 = vld1_u8(left);
    q10u16 = vmovl_u8(d18u8);
672 673
    d20u16 = vget_low_u16(q10u16);
    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
674 675
      q0u16 = vdupq_lane_u16(d20u16, 0);
      q8u16 = vdupq_lane_u16(d20u16, 1);
clang-format's avatar
clang-format committed
676 677 678 679 680 681 682 683
      q1s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
      q0s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
      q11s16 =
          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
      q8s16 =
          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
684 685 686 687 688 689
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
690
      dst += stride;
691 692
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
693
      dst += stride;
694

695 696
      q0u16 = vdupq_lane_u16(d20u16, 2);
      q8u16 = vdupq_lane_u16(d20u16, 3);
clang-format's avatar
clang-format committed
697 698 699 700 701 702 703 704
      q1s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
      q0s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
      q11s16 =
          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
      q8s16 =
          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
705 706 707 708 709 710
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
711
      dst += stride;
712 713
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
714
      dst += stride;
715
    }
716
  }
717 718
}

719
void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
720 721 722 723 724 725 726
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
  uint8x16_t q0u8, q1u8, q2u8;
  int16x8_t q12s16, q13s16, q14s16, q15s16;
  uint16x4_t d6u16;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
727

728
  q0u8 = vld1q_dup_u8(above - 1);
729 730 731 732 733 734 735 736 737 738 739 740
  q1u8 = vld1q_u8(above);
  q2u8 = vld1q_u8(above + 16);
  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
  for (k = 0; k < 4; k++, left += 8) {
    d26u8 = vld1_u8(left);
    q3u16 = vmovl_u8(d26u8);
    d6u16 = vget_low_u16(q3u16);
    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
      q0u16 = vdupq_lane_u16(d6u16, 0);
clang-format's avatar
clang-format committed
741 742 743 744
      q12s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
      q13s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
745 746 747 748 749 750 751 752 753 754 755 756
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
757
      dst += stride;
758

759
      q0u16 = vdupq_lane_u16(d6u16, 1);
clang-format's avatar
clang-format committed
760 761 762 763
      q12s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
      q13s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
764 765 766 767 768 769 770 771 772 773 774 775
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
776
      dst += stride;
777

778
      q0u16 = vdupq_lane_u16(d6u16, 2);
clang-format's avatar
clang-format committed
779 780 781 782
      q12s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
      q13s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
783 784 785 786 787 788 789 790 791 792 793 794
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
795
      dst += stride;
796

797
      q0u16 = vdupq_lane_u16(d6u16, 3);
clang-format's avatar
clang-format committed
798 799 800 801
      q12s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
      q13s16 =
          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
802 803 804 805 806 807 808 809 810 811 812 813
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
814
      dst += stride;
815
    }
816
  }
817
}
818
#endif  // !HAVE_NEON_ASM