Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Guillaume Martres
aom-rav1e
Commits
fe437bc8
Commit
fe437bc8
authored
May 03, 2014
by
Johann
Committed by
Gerrit Code Review
May 03, 2014
Browse files
Merge "VP8 for ARMv8 by using NEON intrinsics 07"
parents
56186c25
930557be
Changes
3
Hide whitespace changes
Inline
Side-by-side
vp8/common/arm/neon/iwalsh_neon.asm
deleted
100644 → 0
View file @
56186c25
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp8_short_inv_walsh4x4_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA    |.text|, CODE, READONLY  ; name this block of code

;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
;
; Inverse 4x4 Walsh-Hadamard transform of the DC coefficient block.
; Two butterfly passes (columns, then rows after an in-register 4x4
; transpose), then (x + 3) >> 3 rounding on every output value.
;
; In:      r0 = input       (16 contiguous int16 coefficients, 128-bit aligned)
;          r1 = mb_dqcoeff  (outputs are scattered: one int16 written every
;                            16 int16s, i.e. 32 bytes apart — the DC slot of
;                            each of the 16 per-block dequant buffers)
; Clobbers: r2, r3, q0-q3, q8, flags untouched
|vp8_short_inv_walsh4x4_neon| PROC

    ; read in all four lines of values: d0->d3
    vld1.i16    {q0-q1}, [r0@128]

    ; first for loop
    vadd.s16    d4, d0, d3          ;a = [0] + [12]
    vadd.s16    d6, d1, d2          ;b = [4] + [8]
    vsub.s16    d5, d0, d3          ;d = [0] - [12]
    vsub.s16    d7, d1, d2          ;c = [4] - [8]

    ; q2 = {a, d}, q3 = {b, c}: both halves of the butterfly in one q-op each
    vadd.s16    q0, q2, q3          ; a+b d+c
    vsub.s16    q1, q2, q3          ; a-b d-c

    ; 4x4 transpose so pass 2 can reuse the same column butterfly on rows
    vtrn.32     d0, d2              ;d0: 0  1  8  9
                                    ;d2: 2  3 10 11
    vtrn.32     d1, d3              ;d1: 4  5 12 13
                                    ;d3: 6  7 14 15

    vtrn.16     d0, d1              ;d0: 0 4  8 12
                                    ;d1: 1 5  9 13
    vtrn.16     d2, d3              ;d2: 2 6 10 14
                                    ;d3: 3 7 11 15

    ; second for loop
    vadd.s16    d4, d0, d3          ;a = [0] + [3]
    vadd.s16    d6, d1, d2          ;b = [1] + [2]
    vsub.s16    d5, d0, d3          ;d = [0] - [3]
    vsub.s16    d7, d1, d2          ;c = [1] - [2]

    vmov.i16    q8, #3              ; rounding bias for the >>3 below

    vadd.s16    q0, q2, q3          ; a+b d+c
    vsub.s16    q1, q2, q3          ; a-b d-c

    vadd.i16    q0, q0, q8          ;e/f += 3
    vadd.i16    q1, q1, q8          ;g/h += 3

    vshr.s16    q0, q0, #3          ;e/f >> 3
    vshr.s16    q1, q1, #3          ;g/h >> 3

    ; scatter the 16 results: r2 = stride between outputs (64 bytes = two
    ; post-increments of 32), r3 runs 32 bytes ahead of r1 so the two
    ; pointers interleave the odd/even output slots
    mov         r2, #64
    add         r3, r1, #32

    vst1.i16    d0[0], [r1], r2
    vst1.i16    d1[0], [r3], r2
    vst1.i16    d2[0], [r1], r2
    vst1.i16    d3[0], [r3], r2

    vst1.i16    d0[1], [r1], r2
    vst1.i16    d1[1], [r3], r2
    vst1.i16    d2[1], [r1], r2
    vst1.i16    d3[1], [r3], r2

    vst1.i16    d0[2], [r1], r2
    vst1.i16    d1[2], [r3], r2
    vst1.i16    d2[2], [r1], r2
    vst1.i16    d3[2], [r3], r2

    vst1.i16    d0[3], [r1], r2
    vst1.i16    d1[3], [r3], r2
    vst1.i16    d2[3], [r1]
    vst1.i16    d3[3], [r3]

    bx          lr
    ENDP                            ; |vp8_short_inv_walsh4x4_neon|
    END
vp8/common/arm/neon/iwalsh_neon.c
0 → 100644
View file @
fe437bc8
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include
<arm_neon.h>
/* Inverse 4x4 Walsh-Hadamard transform of the DC block (NEON intrinsics
 * port of iwalsh_neon.asm).
 *
 * input:      16 contiguous int16 coefficients.
 * mb_dqcoeff: scatter destination — one result int16 is written every
 *             16 elements (the DC slot of each per-block dequant buffer).
 *
 * Two butterfly passes with a 4x4 transpose in between, then a rounded
 * arithmetic shift: out = (x + 3) >> 3.
 */
void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) {
  int16x8_t rows01, rows23, ad, bc, sum, diff;
  int16x4_t a, b, c, d;
  int32x2x2_t trn32_lo, trn32_hi;
  int16x4x2_t col01, col23;

  rows01 = vld1q_s16(input);     /* elements [0..7]  */
  rows23 = vld1q_s16(input + 8); /* elements [8..15] */

  /* Pass 1: butterfly down the columns. */
  a = vadd_s16(vget_low_s16(rows01), vget_high_s16(rows23)); /* [0]+[12] */
  b = vadd_s16(vget_high_s16(rows01), vget_low_s16(rows23)); /* [4]+[8]  */
  d = vsub_s16(vget_low_s16(rows01), vget_high_s16(rows23)); /* [0]-[12] */
  c = vsub_s16(vget_high_s16(rows01), vget_low_s16(rows23)); /* [4]-[8]  */

  ad = vcombine_s16(a, d);
  bc = vcombine_s16(b, c);
  sum = vaddq_s16(ad, bc);  /* a+b | d+c */
  diff = vsubq_s16(ad, bc); /* a-b | d-c */

  /* 4x4 transpose so pass 2 applies the same butterfly across rows. */
  trn32_lo = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(sum)),
                      vreinterpret_s32_s16(vget_low_s16(diff)));
  trn32_hi = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(sum)),
                      vreinterpret_s32_s16(vget_high_s16(diff)));
  col01 = vtrn_s16(vreinterpret_s16_s32(trn32_lo.val[0]),
                   vreinterpret_s16_s32(trn32_hi.val[0]));
  col23 = vtrn_s16(vreinterpret_s16_s32(trn32_lo.val[1]),
                   vreinterpret_s16_s32(trn32_hi.val[1]));

  /* Pass 2: butterfly across the (now transposed) rows. */
  a = vadd_s16(col01.val[0], col23.val[1]); /* [0]+[3] */
  b = vadd_s16(col01.val[1], col23.val[0]); /* [1]+[2] */
  d = vsub_s16(col01.val[0], col23.val[1]); /* [0]-[3] */
  c = vsub_s16(col01.val[1], col23.val[0]); /* [1]-[2] */

  ad = vcombine_s16(a, d);
  bc = vcombine_s16(b, c);

  /* Rounded down-shift: (x + 3) >> 3. */
  sum = vshrq_n_s16(vaddq_s16(vaddq_s16(ad, bc), vdupq_n_s16(3)), 3);
  diff = vshrq_n_s16(vaddq_s16(vsubq_s16(ad, bc), vdupq_n_s16(3)), 3);

  /* Scatter the 16 results, one every 16 int16s.  Lane indices must be
   * compile-time constants, so the stores stay fully unrolled; q-register
   * lanes 0-3 are the old "low d", lanes 4-7 the old "high d". */
  vst1q_lane_s16(mb_dqcoeff + 0 * 16, sum, 0);
  vst1q_lane_s16(mb_dqcoeff + 1 * 16, sum, 4);
  vst1q_lane_s16(mb_dqcoeff + 2 * 16, diff, 0);
  vst1q_lane_s16(mb_dqcoeff + 3 * 16, diff, 4);

  vst1q_lane_s16(mb_dqcoeff + 4 * 16, sum, 1);
  vst1q_lane_s16(mb_dqcoeff + 5 * 16, sum, 5);
  vst1q_lane_s16(mb_dqcoeff + 6 * 16, diff, 1);
  vst1q_lane_s16(mb_dqcoeff + 7 * 16, diff, 5);

  vst1q_lane_s16(mb_dqcoeff + 8 * 16, sum, 2);
  vst1q_lane_s16(mb_dqcoeff + 9 * 16, sum, 6);
  vst1q_lane_s16(mb_dqcoeff + 10 * 16, diff, 2);
  vst1q_lane_s16(mb_dqcoeff + 11 * 16, diff, 6);

  vst1q_lane_s16(mb_dqcoeff + 12 * 16, sum, 3);
  vst1q_lane_s16(mb_dqcoeff + 13 * 16, sum, 7);
  vst1q_lane_s16(mb_dqcoeff + 14 * 16, diff, 3);
  vst1q_lane_s16(mb_dqcoeff + 15 * 16, diff, 7);
}
vp8/vp8_common.mk
View file @
fe437bc8
...
...
@@ -159,7 +159,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_
VP8_COMMON_SRCS-$(HAVE_MEDIA)
+=
common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6
$(ASM)
# common (neon)
VP8_COMMON_SRCS-$(HAVE_NEON)
+=
common/arm/neon/iwalsh_neon
$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON)
+=
common/arm/neon/loopfilter_neon
$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON)
+=
common/arm/neon/loopfiltersimplehorizontaledge_neon
$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON)
+=
common/arm/neon/loopfiltersimpleverticaledge_neon
$(ASM)
...
...
@@ -186,6 +185,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON)
+=
common/arm/neon/dequant_idct_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON)
+=
common/arm/neon/dequantizeb_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON)
+=
common/arm/neon/idct_dequant_full_2x_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON)
+=
common/arm/neon/iwalsh_neon.c
$(eval
$(call
rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment