Xiph.Org / aom-rav1e

Commit a1a7ad0c, authored Oct 11, 2016 by Yaowu Xu; committed by Gerrit Code Review, Oct 11, 2016

Merge "Make generic SIMD work with clang." into nextgenv2

Parents: 0bab35bf 7b7624e8
Changes: 4 files
aom_dsp/simd/v128_intrinsics_arm.h
@@ -28,7 +28,7 @@ SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
 SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }

 SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
-  return vcombine_s64(b, a);
+  return vcombine_s64((uint64x1_t)b, (uint64x1_t)a);
 }

 SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
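Note: this first hunk shows the root cause the whole commit addresses. gcc's arm_neon.h has historically defined the single-lane 64-bit types (int64x1_t, uint64x1_t) so that they effectively behave as plain 64-bit integers, letting scalars convert silently; clang defines them as true vector types and rejects scalar-to-vector implicit conversion, but it does accept conversions between integer vector types of the same width by default. That is why casting the uint64_t arguments to uint64x1_t is enough even though vcombine_s64 formally takes int64x1_t. A minimal standalone sketch of the pattern (hypothetical function name):

#include <arm_neon.h>
#include <stdint.h>

/* Build a 128-bit vector from two 64-bit scalar halves. */
int64x2_t make_v128(uint64_t hi, uint64_t lo) {
  /* return vcombine_s64(lo, hi);  -- accepted by gcc, rejected by clang */
  return vcombine_s64((uint64x1_t)lo, (uint64x1_t)hi); /* portable */
}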
@@ -52,7 +52,9 @@ SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
 }

 SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
-#if __OPTIMIZE__
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others wont.
+#if __OPTIMIZE__ && !__clang__
   return c ? vreinterpretq_s64_s8(vextq_s8(vreinterpretq_s8_s64(b),
                                            vreinterpretq_s8_s64(a), c))
            : b;
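Note: vextq_s8 needs its lane-count argument to be a compile-time immediate. gcc only enforces this late, so under __OPTIMIZE__ the constant usually propagates in from the inlined call site; clang checks the builtin's argument up front and rejects a variable c regardless of optimisation level, hence the added !__clang__. When the intrinsic path is disabled, the header falls back to a generic implementation not shown in this hunk; one way such a fallback can be written, as a sketch with a hypothetical name:

#include <arm_neon.h>
#include <stdint.h>

/* Spill the concatenation b (low half) : a (high half) to a buffer and
 * reload 16 bytes starting at byte offset c (0..15). */
static int64x2_t v128_align_generic(int64x2_t a, int64x2_t b, unsigned int c) {
  uint8_t tmp[32];
  vst1q_u8(tmp, vreinterpretq_u8_s64(b));
  vst1q_u8(tmp + 16, vreinterpretq_u8_s64(a));
  return vreinterpretq_s64_u8(vld1q_u8(tmp + c));
}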
@@ -122,7 +124,7 @@ typedef struct { ssd64_internal hi, lo; } ssd128_internal;
 SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
   ssd128_internal s;
-  s.hi = s.lo = 0;
+  s.hi = s.lo = (ssd64_internal)(uint64_t)0;
   return s;
 }
@@ -430,11 +432,11 @@ SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
 SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
   return v128_from_64(
-      vreinterpret_s64_u8(vtbl2_u8(
+      (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
           (uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
                            vget_high_u8(vreinterpretq_u8_s64(x)) } },
           vreinterpret_u8_s64(vget_high_s64(pattern)))),
-      vreinterpret_s64_u8(vtbl2_u8(
+      (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
           (uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
                            vget_high_u8(vreinterpretq_u8_s64(x)) } },
           vreinterpret_u8_s64(vget_low_s64(pattern)))));
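Note: the vtbl2_u8 pair above implements a full 16-byte shuffle; the new (uint64_t) casts again bridge the one-lane vector results into v128_from_64's scalar parameters. A plain-C model of the shuffle's semantics (illustrative only; like vtbl, out-of-range indices yield zero):

#include <stdint.h>

/* out[i] = pattern[i] < 16 ? x[pattern[i]] : 0, for i = 0..15. */
static void shuffle_8_ref(uint8_t out[16], const uint8_t x[16],
                          const uint8_t pattern[16]) {
  for (int i = 0; i < 16; i++) out[i] = pattern[i] < 16 ? x[pattern[i]] : 0;
}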
@@ -521,21 +523,24 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
       vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
 }

-#if __OPTIMIZE__
+#if __OPTIMIZE__ && !__clang__

 SIMD_INLINE v128 v128_shl_n_byte(v128 a, const unsigned int n) {
   return n < 8
              ? v128_from_64(
-                   vorr_u64(vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                       n * 8),
-                            vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
-                                       (8 - n) * 8)),
-                   vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8))
-             : (n == 8 ? v128_from_64(vreinterpret_u64_s64(vget_low_s64(a)), 0)
-                       : v128_from_64(vshl_n_u64(vreinterpret_u64_s64(
-                                                     vget_low_s64(a)),
-                                                 (n - 8) * 8),
-                                      0));
+                   (uint64_t)vorr_u64(
+                       vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+                                  n * 8),
+                       vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+                                  (8 - n) * 8)),
+                   (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+                                        n * 8))
+             : (n == 8
+                    ? v128_from_64(
+                          (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
+                    : v128_from_64(
+                          (uint64_t)vshl_n_u64(
+                              vreinterpret_u64_s64(vget_low_s64(a)),
+                              (n - 8) * 8),
+                          0));
 }

 SIMD_INLINE v128 v128_shr_n_byte(v128 a, const unsigned int n) {
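Note: for n < 8 the function composes a 128-bit left shift by n bytes from 64-bit halves: the high half receives its own bits shifted up plus the bits carried out of the low half. The same arithmetic on plain integers (illustrative stand-in type, not from the patch):

#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128_model; /* hypothetical stand-in */

/* Shift {hi, lo} left by n bytes, for 0 < n < 8. */
static u128_model shl_n_byte_model(u128_model v, unsigned int n) {
  u128_model r;
  r.hi = (v.hi << (n * 8)) | (v.lo >> ((8 - n) * 8));
  r.lo = v.lo << (n * 8);
  return r;
}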
aom_dsp/simd/v128_intrinsics_x86.h
@@ -58,7 +58,9 @@ SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
   _mm_storeu_si128((__m128i *)p, a);
 }

-#if defined(__OPTIMIZE__)
+// The following function requires an immediate.
+// Some compilers will check this during optimisation, others wont.
+#if __OPTIMIZE__ && !__clang__
 #if defined(__SSSE3__)
 SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
   return c ? _mm_alignr_epi8(a, b, c) : b;
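Note: the x86 header needs the same guard because _mm_alignr_epi8 (PALIGNR) encodes its byte count as an instruction immediate; clang validates that argument before inlining or constant folding can turn a variable c into a constant. With a literal count the intrinsic is fine under every compiler (illustrative snippet):

#include <tmmintrin.h>

/* Bytes 4..19 of the 32-byte concatenation a:b (a high, b low). */
static __m128i align4(__m128i a, __m128i b) {
  return _mm_alignr_epi8(a, b, 4);
}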
aom_dsp/simd/v64_intrinsics_arm.h
@@ -14,12 +14,10 @@
 #include <arm_neon.h>
 #include "./v64_intrinsics_arm.h"
+#include "aom_ports/arm.h"

-/* vzip in gcc is broken. Fixed in 4.6.1? */
-#if __GNUC__ && \
-    ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ < \
-     (4 << 16) + (6 << 8) + 1)
-#error vzip buggy in gcc. Get at least gcc 4.6.1.
+#ifdef AOM_INCOMPATIBLE_GCC
+#error Incompatible gcc
 #endif

 typedef int64x1_t v64;
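Note: the open-coded gcc version test is replaced by a macro supplied by aom_ports/arm.h. This matters for clang, which defines __GNUC__ (reporting itself as an old gcc) and so would have tripped the raw version check. A sketch of how the centralised macro might be defined, based on the check removed above (assumption; aom_ports/arm.h is not part of this diff):

/* Flag gcc releases older than 4.6.1 (broken vzip), but never clang. */
#if defined(__GNUC__) && !defined(__clang__) &&                       \
    ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ < \
     (4 << 16) + (6 << 8) + 1)
#define AOM_INCOMPATIBLE_GCC
#endif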
@@ -51,7 +49,7 @@ SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
 SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }

-SIMD_INLINE uint64_t v64_u64(v64 x) { return x; }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }

 SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
   return *((uint32_t *)p);
@@ -66,12 +64,16 @@ SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
 }

 SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
-#if __CC_ARM
+#if __clang__
+  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+                0);
+#elif __CC_ARM
   *((__packed uint32_t *)p) = a;
 #elif __GNUC__
   *((__attribute((packed)) uint32_t *)p) = a;
 #else
-  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64(a), 0);
+  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+                0);
 #endif
 }
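Note: a dedicated clang branch is added because the fallback branch passed a uint32_t where vreinterpret_u32_s64 expects a 64-bit vector, relying on the implicit conversions described earlier. As an aside, an unaligned 32-bit store can also be written portably with memcpy, which compilers typically lower to a single store on targets that permit unaligned access (illustrative sketch, not part of the patch):

#include <stdint.h>
#include <string.h>

static void u32_store_unaligned_ref(void *p, uint32_t a) {
  memcpy(p, &a, sizeof(a));
}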
@@ -91,13 +93,16 @@ SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
   vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
 }

+// The following function requires an immediate.
+// Some compilers will check this if it's optimising, others wont.
 SIMD_INLINE v64 v64_align(v64 a, v64 b, const unsigned int c) {
-#if __OPTIMIZE__
+#if __OPTIMIZE__ && !__clang__
   return c ? vreinterpret_s64_s8(
                  vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
            : b;
 #else
-  return c ? v64_from_64(b >> c * 8) | (a << (8 - c) * 8) : b;
+  return c ? v64_from_64((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8)
+           : b;
 #endif
 }
@@ -121,21 +126,21 @@ SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
   int64x2_t r = vpaddlq_s32(vpaddlq_s16(
       vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
                 vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
-  return vadd_s64(vget_high_s64(r), vget_low_s64(r));
+  return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
 }

 SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
   int64x2_t r =
       vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-  return vget_high_s64(r) + vget_low_s64(r);
+  return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
 }

 SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
-  return vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+  return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
 }

 SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
-  return vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
+  return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
 }

 typedef uint16x8_t sad64_internal;
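Note: these reductions end in a one-lane vector (int64x1_t/uint64x1_t), and the added casts convert that lane to a plain integer, which clang will not do implicitly. An equivalent cast-free way to extract the lane (illustrative alternative; not what the patch uses):

#include <arm_neon.h>
#include <stdint.h>

static int64_t lane0(int64x1_t v) { return vget_lane_s64(v, 0); }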
@@ -151,12 +156,14 @@ SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
 SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
   uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
-  return (uint32_t)(vget_high_u64(r) + vget_low_u64(r));
+  return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
 }

 typedef int64x1_t ssd64_internal;

-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return 0; }
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
+  return (ssd64_internal)(uint64_t)0;
+}

 /* Implementation dependent return value. Result must be finalised with
  * v64_ssd_u8_sum(). */
@@ -166,7 +173,9 @@ SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
   return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
 }

-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return (uint32_t)s; }
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+  return (uint32_t)(uint64_t)s;
+}

 SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
@@ -470,7 +479,9 @@ SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
       vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
 }

-#if __OPTIMIZE__
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others wont.
+#if __OPTIMIZE__ && !__clang__

 SIMD_INLINE v64 v64_shl_n_byte(v64 a, const unsigned int c) {
   return vshl_n_s64(a, c * 8);
aom_dsp/simd/v64_intrinsics_x86.h
@@ -86,6 +86,7 @@ SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
   _mm_storel_epi64((__m128i *)p, a);
 }

+// The following function requires an immediate.
 #if __OPTIMIZE__
 #define v64_align(a, b, c) \
   (c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b;