Commit feaf766b authored by James Yu's avatar James Yu Committed by Gerrit Code Review
Browse files

VP8 for ARMv8 by using NEON intrinsics 12



Add sad_neon.c
- vp8_sad16x16_neon
- vp8_sad16x8_neon
- vp8_sad8x8_neon
- vp8_sad8x16_neon
- vp8_sad4x4_neon

Change-Id: I08eaae49ec03fb91b394354660a5df0367cea311
Signed-off-by: default avatarJames Yu <james.yu@linaro.org>
parent 4a8336fa
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sad16x16_neon|
EXPORT |vp8_sad16x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int src_stride
; r2 unsigned char *ref_ptr
; r3 int ref_stride
|vp8_sad16x16_neon| PROC
;;
vpush {d8-d15}
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabdl.u8 q12, d0, d8
vabdl.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0]
vld1.8 {q7}, [r2]
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vadd.u16 q0, q12, q13
vpaddl.u16 q1, q0
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
vpop {d8-d15}
bx lr
ENDP
;==============================
;unsigned int vp8_sad16x8_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad16x8_neon| PROC
vpush {d8-d15}
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabdl.u8 q12, d0, d8
vabdl.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vadd.u16 q0, q12, q13
vpaddl.u16 q1, q0
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
vpop {d8-d15}
bx lr
ENDP
END
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sad8x8_neon|
EXPORT |vp8_sad8x16_neon|
EXPORT |vp8_sad4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; unsigned int vp8_sad8x8_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad8x8_neon| PROC
vpush {d8-d15}
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
vpaddl.u16 q1, q12
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
vpop {d8-d15}
bx lr
ENDP
;============================
;unsigned int vp8_sad8x16_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad8x16_neon| PROC
vpush {d8-d15}
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
vpaddl.u16 q1, q12
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
vpop {d8-d15}
bx lr
ENDP
;===========================
;unsigned int vp8_sad4x4_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad4x4_neon| PROC
vpush {d8-d15}
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
vpaddl.u16 d1, d24
vpaddl.u32 d0, d1
vmov.32 r0, d0[0]
vpop {d8-d15}
bx lr
ENDP
END
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
unsigned int vp8_sad8x8_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 7; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad8x16_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 15; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad4x4_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x2_t d1;
uint64x1_t d3;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 3; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
d1 = vpaddl_u16(vget_low_u16(q12));
d3 = vpaddl_u32(d1);
return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}
unsigned int vp8_sad16x16_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x16_t q0, q4;
uint16x8_t q12, q13;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
for (i = 0; i < 15; i++) {
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
}
q12 = vaddq_u16(q12, q13);
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad16x8_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x16_t q0, q4;
uint16x8_t q12, q13;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
for (i = 0; i < 7; i++) {
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
}
q12 = vaddq_u16(q12, q13);
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}