sad_neon.c 7.56 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
Johann's avatar
Johann committed
12

13 14
#include "./vpx_config.h"

Yaowu Xu's avatar
Yaowu Xu committed
15
#include "aom/vpx_integer.h"
16

clang-format's avatar
clang-format committed
17 18 19 20 21 22 23 24 25 26 27 28 29 30
/*
 * Sum of absolute differences over an 8-column, 16-row block.
 *
 * src_ptr / ref_ptr point at the first row of each block; src_stride and
 * ref_stride are the byte offsets between consecutive rows. Returns the total
 * of |src - ref| over all 128 byte pairs.
 */
unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  /* Row 0 seeds the eight per-column 16-bit sums. */
  uint16x8_t col_sums = vabdl_u8(vld1_u8(src_ptr), vld1_u8(ref_ptr));
  uint64x2_t pair_sums;
  uint64x1_t total;
  int rows_left;

  /* Rows 1..15: widen and accumulate; 16 * 255 per lane fits in 16 bits. */
  for (rows_left = 15; rows_left > 0; --rows_left) {
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    col_sums = vabal_u8(col_sums, vld1_u8(src_ptr), vld1_u8(ref_ptr));
  }

  /* Fold 8 x u16 -> 4 x u32 -> 2 x u64, then add the two halves. */
  pair_sums = vpaddlq_u32(vpaddlq_u16(col_sums));
  total = vadd_u64(vget_low_u64(pair_sums), vget_high_u64(pair_sums));

  return vget_lane_u32(vreinterpret_u32_u64(total), 0);
}

clang-format's avatar
clang-format committed
48 49 50 51 52 53 54 55 56 57 58 59 60
/*
 * Sum of absolute differences over a 4-column, 4-row block.
 *
 * src_ptr / ref_ptr point at the first row of each block; src_stride and
 * ref_stride are the byte offsets between consecutive rows. Returns the total
 * of |src - ref| over the 16 byte pairs.
 *
 * Only the four bytes that belong to each row are read. An 8-byte vector load
 * per row would touch four bytes beyond the block, which on the final row can
 * fall outside the caller's buffer.
 */
unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride,
                             unsigned char *ref_ptr, int ref_stride) {
  uint8_t src_px[16];
  uint8_t ref_px[16];
  uint8x16_t src_vec, ref_vec;
  uint16x8_t abs_sum;
  uint64x2_t pair_sums;
  uint64x1_t total;
  int row, col;

  /* Gather the 4x4 block into a contiguous 16-byte staging buffer. */
  for (row = 0; row < 4; ++row) {
    for (col = 0; col < 4; ++col) {
      src_px[4 * row + col] = src_ptr[row * src_stride + col];
      ref_px[4 * row + col] = ref_ptr[row * ref_stride + col];
    }
  }

  src_vec = vld1q_u8(src_px);
  ref_vec = vld1q_u8(ref_px);

  /* Rows 0-1 sit in the low half, rows 2-3 in the high half. */
  abs_sum = vabdl_u8(vget_low_u8(src_vec), vget_low_u8(ref_vec));
  abs_sum = vabal_u8(abs_sum, vget_high_u8(src_vec), vget_high_u8(ref_vec));

  /* Fold 8 x u16 -> 4 x u32 -> 2 x u64, then add the two halves. */
  pair_sums = vpaddlq_u32(vpaddlq_u16(abs_sum));
  total = vadd_u64(vget_low_u64(pair_sums), vget_high_u64(pair_sums));

  return vget_lane_u32(vreinterpret_u32_u64(total), 0);
}

clang-format's avatar
clang-format committed
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
/*
 * Sum of absolute differences over a 16-column, 8-row block.
 *
 * src_ptr / ref_ptr point at the first row of each block; src_stride and
 * ref_stride are the byte offsets between consecutive rows. Returns the total
 * of |src - ref| over all 128 byte pairs.
 */
unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  uint8x16_t src_row = vld1q_u8(src_ptr);
  uint8x16_t ref_row = vld1q_u8(ref_ptr);
  /* Columns 0-7 and 8-15 are accumulated in separate 16-bit vectors. */
  uint16x8_t sums_left =
      vabdl_u8(vget_low_u8(src_row), vget_low_u8(ref_row));
  uint16x8_t sums_right =
      vabdl_u8(vget_high_u8(src_row), vget_high_u8(ref_row));
  uint64x2_t pair_sums;
  uint64x1_t total;
  int rows_left;

  /* Rows 1..7. */
  for (rows_left = 7; rows_left > 0; --rows_left) {
    src_ptr += src_stride;
    ref_ptr += ref_stride;
    src_row = vld1q_u8(src_ptr);
    ref_row = vld1q_u8(ref_ptr);
    sums_left =
        vabal_u8(sums_left, vget_low_u8(src_row), vget_low_u8(ref_row));
    sums_right =
        vabal_u8(sums_right, vget_high_u8(src_row), vget_high_u8(ref_row));
  }

  /* Each lane holds at most 8 * 255, so the 16-bit merge cannot overflow. */
  pair_sums = vpaddlq_u32(vpaddlq_u16(vaddq_u16(sums_left, sums_right)));
  total = vadd_u64(vget_low_u64(pair_sums), vget_high_u64(pair_sums));

  return vget_lane_u32(vreinterpret_u32_u64(total), 0);
}

110 111
/*
 * Returns the sum of all sixteen 16-bit lanes of vec_lo and vec_hi.
 * Lanes are widened to 32 bits before any two of them are added, so the
 * result is exact even when individual lanes are close to 65535.
 */
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  /* Four 32-bit partial sums, each collecting one lane from every half. */
  uint32x4_t quad_sums = vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  uint64x2_t pair_sums;
  uint64x1_t total;

  quad_sums = vaddw_u16(quad_sums, vget_low_u16(vec_hi));
  quad_sums = vaddw_u16(quad_sums, vget_high_u16(vec_hi));

  /* 4 x u32 -> 2 x u64, then add the two halves. */
  pair_sums = vpaddlq_u32(quad_sums);
  total = vadd_u64(vget_low_u64(pair_sums), vget_high_u64(pair_sums));

  return vget_lane_u32(vreinterpret_u32_u64(total), 0);
}
Scott LaVarnway's avatar
Scott LaVarnway committed
122 123
/* Returns the sum of the eight 16-bit lanes of vec_16x8. */
static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  /* 8 x u16 -> 4 x u32 -> 2 x u64 via pairwise widening adds. */
  const uint64x2_t pair_sums = vpaddlq_u32(vpaddlq_u16(vec_16x8));
  const uint64x1_t total =
      vadd_u64(vget_low_u64(pair_sums), vget_high_u64(pair_sums));

  return vget_lane_u32(vreinterpret_u32_u64(total), 0);
}

Johann's avatar
Johann committed
130
/*
 * Sum of absolute differences over a 64x64 block.
 *
 * src / ref point at the first row of each block; src_stride and ref_stride
 * are the byte offsets between consecutive rows.
 */
unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  uint16x8_t sums_lo = vdupq_n_u16(0);
  uint16x8_t sums_hi = vdupq_n_u16(0);
  int row, x;

  for (row = 0; row < 64; ++row) {
    /* Four 16-byte chunks per row; low and high halves accumulate apart. */
    for (x = 0; x < 64; x += 16) {
      const uint8x16_t src_chunk = vld1q_u8(src + x);
      const uint8x16_t ref_chunk = vld1q_u8(ref + x);
      sums_lo =
          vabal_u8(sums_lo, vget_low_u8(src_chunk), vget_low_u8(ref_chunk));
      sums_hi =
          vabal_u8(sums_hi, vget_high_u8(src_chunk), vget_high_u8(ref_chunk));
    }
    src += src_stride;
    ref += ref_stride;
  }

  /* Each lane may reach 4 * 64 * 255 = 65280, which still fits in 16 bits,
   * but adding two such lanes would not, hence the widening reduction. */
  return horizontal_long_add_16x8(sums_lo, sums_hi);
}

Johann's avatar
Johann committed
166
/*
 * Sum of absolute differences over a 32x32 block.
 *
 * src / ref point at the first row of each block; src_stride and ref_stride
 * are the byte offsets between consecutive rows.
 */
unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  uint16x8_t sums_lo = vdupq_n_u16(0);
  uint16x8_t sums_hi = vdupq_n_u16(0);
  int row, x;

  for (row = 0; row < 32; ++row) {
    /* Two 16-byte chunks per row; low and high halves accumulate apart. */
    for (x = 0; x < 32; x += 16) {
      const uint8x16_t src_chunk = vld1q_u8(src + x);
      const uint8x16_t ref_chunk = vld1q_u8(ref + x);
      sums_lo =
          vabal_u8(sums_lo, vget_low_u8(src_chunk), vget_low_u8(ref_chunk));
      sums_hi =
          vabal_u8(sums_hi, vget_high_u8(src_chunk), vget_high_u8(ref_chunk));
    }
    src += src_stride;
    ref += ref_stride;
  }

  /* Lanes peak at 2 * 32 * 255 = 16320, so merging them in 16 bits is safe. */
  return horizontal_add_16x8(vaddq_u16(sums_lo, sums_hi));
}

Johann's avatar
Johann committed
191
/*
 * Sum of absolute differences over a 16x16 block.
 *
 * src / ref point at the first row of each block; src_stride and ref_stride
 * are the byte offsets between consecutive rows.
 */
unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  uint16x8_t sums_lo = vdupq_n_u16(0);
  uint16x8_t sums_hi = vdupq_n_u16(0);
  int rows_left = 16;

  while (rows_left-- > 0) {
    const uint8x16_t src_row = vld1q_u8(src);
    const uint8x16_t ref_row = vld1q_u8(ref);
    /* Columns 0-7 go to sums_lo, columns 8-15 to sums_hi. */
    sums_lo = vabal_u8(sums_lo, vget_low_u8(src_row), vget_low_u8(ref_row));
    sums_hi = vabal_u8(sums_hi, vget_high_u8(src_row), vget_high_u8(ref_row));
    src += src_stride;
    ref += ref_stride;
  }

  /* Lanes peak at 16 * 255 = 4080, so merging them in 16 bits is safe. */
  return horizontal_add_16x8(vaddq_u16(sums_lo, sums_hi));
}

Johann's avatar
Johann committed
210
/*
 * Sum of absolute differences over an 8x8 block.
 *
 * src / ref point at the first row of each block; src_stride and ref_stride
 * are the byte offsets between consecutive rows.
 */
unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  uint16x8_t col_sums = vdupq_n_u16(0);
  int rows_left = 8;

  while (rows_left-- > 0) {
    /* One 8-byte row from each block, widened and accumulated per column. */
    col_sums = vabal_u8(col_sums, vld1_u8(src), vld1_u8(ref));
    src += src_stride;
    ref += ref_stride;
  }

  return horizontal_add_16x8(col_sums);
}