Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
4cde2ab7
Commit
4cde2ab7
authored
Mar 23, 2011
by
Johann
Committed by
Code Review
Mar 23, 2011
Browse files
Merge "ARMv6 optimized fdct4x4"
parents
edfc93ae
a61785b6
Changes
5
Hide whitespace changes
Inline
Side-by-side
vp8/encoder/arm/arm_csystemdependent.c
View file @
4cde2ab7
...
...
@@ -59,9 +59,9 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
/*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_
c
;
cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_
c;*/
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
*/
cpi
->
rtcd
.
fdct
.
fast4x4
=
vp8_fast_fdct4x4_
armv6
;
cpi
->
rtcd
.
fdct
.
fast8x4
=
vp8_fast_fdct8x4_
armv6
;
cpi
->
rtcd
.
fdct
.
walsh_short4x4
=
vp8_short_walsh4x4_armv6
;
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
...
...
vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
0 → 100644
View file @
4cde2ab7
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT
|
vp8_fast_fdct4x4_armv6
|
ARM
REQUIRE8
PRESERVE8
AREA
|
.text
|
,
CODE
,
READONLY
;------------------------------------------------------------------------------
; void vp8_fast_fdct4x4_armv6(short *input, short *output, int pitch)
;
; NOTE(review): the original comment here named vp8_short_fdct4x4_c, which is
; presumably the C routine this code was modelled on -- confirm. The symbol
; actually exported and defined below is vp8_fast_fdct4x4_armv6.
;
; Register use on entry (as implied by the prototype and the code below):
;   r0 = input  : read with ldrd, advanced by r2 after each of the first 3 rows
;   r1 = output : sixteen halfwords, written as eight words at offsets 0..28
;   r2 = pitch  : added to r0 unscaled, i.e. used as a byte stride
;
; Structure:
;   PART 1 transforms the four input rows one at a time. For each row the
;          pair [o(4n+3) | o(4n+2)] is stored to the output buffer as an
;          intermediate, while [o(4n+1) | o(4n)] is kept in a register
;          (r3, r9, r2, r0 for rows 0..3).
;   PART 2 combines the rows column-wise, two columns per pass: first the
;          pairs held in registers, then the pairs reloaded from the output
;          buffer, and writes the final coefficients.
;
; Values are packed two signed halfwords per register; comments use the
; notation [top halfword | bottom halfword].
;
; r4-r12 and lr are saved on entry and restored on exit; the routine returns
; by loading the saved lr into pc.
;------------------------------------------------------------------------------
|vp8_fast_fdct4x4_armv6| PROC

    stmfd       sp!, {r4 - r12, lr}

    ; PART 1

    ; coeffs 0-3
    ldrd        r4, r5, [r0]            ; [i1 | i0] [i3 | i2]

    ldr         r10, c7500
    ldr         r11, c14500
    ldr         r12, c0x22a453a0        ; [2217*4 | 5352*4]
    ldr         lr, c0x00080008         ; [8 | 8]
    ror         r5, r5, #16             ; [i2 | i3]

    qadd16      r6, r4, r5              ; [i1+i2 | i0+i3] = [b1 | a1] without shift
    qsub16      r7, r4, r5              ; [i1-i2 | i0-i3] = [c1 | d1] without shift

    add         r0, r0, r2              ; update input pointer

    qadd16      r7, r7, r7              ; 2*[c1|d1] --> we can use smlad and smlsd
                                        ; with 2217*4 and 5352*4 without losing the
                                        ; sign bit (overflow)

    smuad       r4, r6, lr              ; o0 = (i1+i2)*8 + (i0+i3)*8
    smusd       r5, r6, lr              ; o2 = (i0+i3)*8 - (i1+i2)*8
                                        ; (smusd is bottom*bottom - top*top)

    smlad       r6, r7, r12, r11        ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
    smlsdx      r7, r7, r12, r10        ; o3 = (d1 * 2217 - c1 * 5352 + 7500)

    ldrd        r8, r9, [r0]            ; [i5 | i4] [i7 | i6]

    pkhbt       r3, r4, r6, lsl #4      ; [o1 | o0], keep in register for PART 2
    pkhbt       r6, r5, r7, lsl #4      ; [o3 | o2]

    str         r6, [r1, #4]            ; intermediate, reloaded in PART 2

    ; coeffs 4-7
    ror         r9, r9, #16             ; [i6 | i7]

    qadd16      r6, r8, r9              ; [i5+i6 | i4+i7] = [b1 | a1] without shift
    qsub16      r7, r8, r9              ; [i5-i6 | i4-i7] = [c1 | d1] without shift

    add         r0, r0, r2              ; update input pointer

    qadd16      r7, r7, r7              ; 2x[c1|d1] --> we can use smlad and smlsd
                                        ; with 2217*4 and 5352*4 without losing the
                                        ; sign bit (overflow)

    smuad       r9, r6, lr              ; o4 = (i5+i6)*8 + (i4+i7)*8
    smusd       r8, r6, lr              ; o6 = (i4+i7)*8 - (i5+i6)*8

    smlad       r6, r7, r12, r11        ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
    smlsdx      r7, r7, r12, r10        ; o7 = (d1 * 2217 - c1 * 5352 + 7500)

    ldrd        r4, r5, [r0]            ; [i9 | i8] [i11 | i10]

    pkhbt       r9, r9, r6, lsl #4      ; [o5 | o4], keep in register for PART 2
    pkhbt       r6, r8, r7, lsl #4      ; [o7 | o6]

    str         r6, [r1, #12]           ; intermediate, reloaded in PART 2

    ; coeffs 8-11
    ror         r5, r5, #16             ; [i10 | i11]

    qadd16      r6, r4, r5              ; [i9+i10 | i8+i11]=[b1 | a1] without shift
    qsub16      r7, r4, r5              ; [i9-i10 | i8-i11]=[c1 | d1] without shift

    add         r0, r0, r2              ; update input pointer (last use of pitch;
                                        ; r2 is reused as a data register below)

    qadd16      r7, r7, r7              ; 2x[c1|d1] --> we can use smlad and smlsd
                                        ; with 2217*4 and 5352*4 without losing the
                                        ; sign bit (overflow)

    smuad       r2, r6, lr              ; o8 = (i9+i10)*8 + (i8+i11)*8
    smusd       r8, r6, lr              ; o10 = (i8+i11)*8 - (i9+i10)*8

    smlad       r6, r7, r12, r11        ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
    smlsdx      r7, r7, r12, r10        ; o11 = (d1 * 2217 - c1 * 5352 + 7500)

    ldrd        r4, r5, [r0]            ; [i13 | i12] [i15 | i14]

    pkhbt       r2, r2, r6, lsl #4      ; [o9 | o8], keep in register for PART 2
    pkhbt       r6, r8, r7, lsl #4      ; [o11 | o10]

    str         r6, [r1, #20]           ; intermediate, reloaded in PART 2

    ; coeffs 12-15
    ror         r5, r5, #16             ; [i14 | i15]

    qadd16      r6, r4, r5              ; [i13+i14 | i12+i15]=[b1|a1] without shift
    qsub16      r7, r4, r5              ; [i13-i14 | i12-i15]=[c1|d1] without shift

    qadd16      r7, r7, r7              ; 2x[c1|d1] --> we can use smlad and smlsd
                                        ; with 2217*4 and 5352*4 without losing the
                                        ; sign bit (overflow)

    smuad       r4, r6, lr              ; o12 = (i13+i14)*8 + (i12+i15)*8
    smusd       r5, r6, lr              ; o14 = (i12+i15)*8 - (i13+i14)*8

    smlad       r6, r7, r12, r11        ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
    smlsdx      r7, r7, r12, r10        ; o15 = (d1 * 2217 - c1 * 5352 + 7500)

    pkhbt       r0, r4, r6, lsl #4      ; [o13 | o12], keep in register for PART 2
                                        ; (the input pointer in r0 is no longer needed)
    pkhbt       r6, r5, r7, lsl #4      ; [o15 | o14]

    str         r6, [r1, #28]           ; intermediate, reloaded in PART 2


    ; PART 2 -------------------------------------------------
    ; In the comments below, iN denotes the intermediate value oN from PART 1.
    ldr         r11, c12000
    ldr         r10, c51000
    ldr         lr, c0x00070007         ; [7 | 7]

    qadd16      r4, r3, r0              ; a1 = [i1+i13 | i0+i12]
    qadd16      r5, r9, r2              ; b1 = [i5+i9  |  i4+i8]
    qsub16      r6, r9, r2              ; c1 = [i5-i9  |  i4-i8]
    qsub16      r7, r3, r0              ; d1 = [i1-i13 | i0-i12]

    qadd16      r4, r4, lr              ; a1 + 7

    add         r0, r11, #0x10000       ; add (d!=0)
                                        ; r0 = 12000 + (1 << 16): since the result is
                                        ; taken from bits 31:16, this adds 1 to it

    qadd16      r2, r4, r5              ; a1 + b1 + 7
    qsub16      r3, r4, r5              ; a1 - b1 + 7

    ldr         r12, c0x08a914e8        ; [2217 | 5352]

    lsl         r8, r2, #16             ; prepare bottom halfword for scaling
    asr         r2, r2, #4              ; scale top halfword
    lsl         r9, r3, #16             ; prepare bottom halfword for scaling
    asr         r3, r3, #4              ; scale top halfword
    pkhtb       r4, r2, r8, asr #20     ; pack and scale bottom halfword
    pkhtb       r5, r3, r9, asr #20     ; pack and scale bottom halfword

    smulbt      r2, r6, r12             ; [ ------ | c1*2217]
    str         r4, [r1, #0]            ; [     o1 |      o0]
    smultt      r3, r6, r12             ; [c1*2217 | ------ ]
    str         r5, [r1, #16]           ; [     o9 |      o8]

    smlabb      r8, r7, r12, r2         ; [ ------ | d1*5352]
    smlatb      r9, r7, r12, r3         ; [d1*5352 | ------ ]

    smulbb      r2, r6, r12             ; [ ------ | c1*5352]
    smultb      r3, r6, r12             ; [c1*5352 | ------ ]

    lsls        r6, r7, #16             ; d1 != 0 ? (bottom halfword; sets Z flag)

    addeq       r8, r8, r11             ; c1_b*2217+d1_b*5352+12000 + (d==0)
    addne       r8, r8, r0              ; c1_b*2217+d1_b*5352+12000 + (d!=0)

    asrs        r6, r7, #16             ; d1 != 0 ? (top halfword; sets Z flag)
    addeq       r9, r9, r11             ; c1_t*2217+d1_t*5352+12000 + (d==0)
    addne       r9, r9, r0              ; c1_t*2217+d1_t*5352+12000 + (d!=0)

    smlabt      r4, r7, r12, r10        ; [ ------ | d1*2217] + 51000
    smlatt      r5, r7, r12, r10        ; [d1*2217 | ------ ] + 51000

    pkhtb       r9, r9, r8, asr #16     ; [o5 | o4]: bits 31:16 of each 32-bit sum

    sub         r4, r4, r2              ; d1_b*2217 + 51000 - c1_b*5352
    sub         r5, r5, r3              ; d1_t*2217 + 51000 - c1_t*5352

    ldr         r3, [r1, #4]            ; [i3 | i2]

    pkhtb       r5, r5, r4, asr #16     ; [o13|o12]

    str         r9, [r1, #8]            ; [o5 | o4]

    ldr         r9, [r1, #12]           ; [i7 | i6]
    ldr         r8, [r1, #28]           ; [i15|i14]
    ldr         r2, [r1, #20]           ; [i11|i10]

    str         r5, [r1, #24]           ; [o13|o12]

    ; second pass of PART 2: the pairs stored by PART 1 at offsets 4/12/20/28
    qadd16      r4, r3, r8              ; a1 = [i3+i15 | i2+i14]
    qadd16      r5, r9, r2              ; b1 = [i7+i11 | i6+i10]

    qadd16      r4, r4, lr              ; a1 + 7

    qsub16      r6, r9, r2              ; c1 = [i7-i11 | i6-i10]
    qadd16      r2, r4, r5              ; a1 + b1 + 7
    qsub16      r7, r3, r8              ; d1 = [i3-i15 | i2-i14]
    qsub16      r3, r4, r5              ; a1 - b1 + 7

    lsl         r8, r2, #16             ; prepare bottom halfword for scaling
    asr         r2, r2, #4              ; scale top halfword
    lsl         r9, r3, #16             ; prepare bottom halfword for scaling
    asr         r3, r3, #4              ; scale top halfword
    pkhtb       r4, r2, r8, asr #20     ; pack and scale bottom halfword
    pkhtb       r5, r3, r9, asr #20     ; pack and scale bottom halfword

    smulbt      r2, r6, r12             ; [ ------ | c1*2217]
    str         r4, [r1, #4]            ; [     o3 |      o2]
    smultt      r3, r6, r12             ; [c1*2217 | ------ ]
    str         r5, [r1, #20]           ; [    o11 |     o10]

    smlabb      r8, r7, r12, r2         ; [ ------ | d1*5352]
    smlatb      r9, r7, r12, r3         ; [d1*5352 | ------ ]

    smulbb      r2, r6, r12             ; [ ------ | c1*5352]
    smultb      r3, r6, r12             ; [c1*5352 | ------ ]

    lsls        r6, r7, #16             ; d1 != 0 ? (bottom halfword; sets Z flag)

    addeq       r8, r8, r11             ; c1_b*2217+d1_b*5352+12000 + (d==0)
    addne       r8, r8, r0              ; c1_b*2217+d1_b*5352+12000 + (d!=0)

    asrs        r6, r7, #16             ; d1 != 0 ? (top halfword; sets Z flag)
    addeq       r9, r9, r11             ; c1_t*2217+d1_t*5352+12000 + (d==0)
    addne       r9, r9, r0              ; c1_t*2217+d1_t*5352+12000 + (d!=0)

    smlabt      r4, r7, r12, r10        ; [ ------ | d1*2217] + 51000
    smlatt      r5, r7, r12, r10        ; [d1*2217 | ------ ] + 51000

    pkhtb       r9, r9, r8, asr #16     ; [o7 | o6]: bits 31:16 of each 32-bit sum

    sub         r4, r4, r2              ; d1_b*2217 + 51000 - c1_b*5352
    sub         r5, r5, r3              ; d1_t*2217 + 51000 - c1_t*5352

    str         r9, [r1, #12]           ; [o7 | o6]
    pkhtb       r5, r5, r4, asr #16     ; [o15|o14]

    str         r5, [r1, #28]           ; [o15|o14]

    ldmfd       sp!, {r4 - r12, pc}     ; restore registers and return

    ENDP
; Used constants
; Each word is loaded by label with ldr in vp8_fast_fdct4x4_armv6. Packed
; entries hold two halfwords, written [top | bottom].
c7500
    DCD     7500            ; added to d1*2217 - c1*5352 in PART 1 (o3/o7/o11/o15)
c14500
    DCD     14500           ; added to c1*2217 + d1*5352 in PART 1 (o1/o5/o9/o13)
c0x22a453a0
    DCD     0x22a453a0      ; [2217*4 | 5352*4]
c0x00080008
    DCD     0x00080008      ; [8 | 8], multiplier for the a1/b1 sums in PART 1
c12000
    DCD     12000           ; added to c1*2217 + d1*5352 in PART 2
c51000
    DCD     51000           ; added to d1*2217 - c1*5352 in PART 2
c0x00070007
    DCD     0x00070007      ; [7 | 7], added to a1 in PART 2
c0x08a914e8
    DCD     0x08a914e8      ; [2217 | 5352]
END
vp8/encoder/arm/dct_arm.c
0 → 100644
View file @
4cde2ab7
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include
"vpx_config.h"
#include
"vp8/encoder/dct.h"
#if HAVE_ARMV6
/*
 * Forward DCT of an 8x4 region, done as two 4x4 transforms placed side by
 * side. The second 4x4 block starts 4 samples further along `input` and its
 * coefficients are written 16 entries further along `output`; `pitch` is
 * handed to the 4x4 routine unchanged.
 */
void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
{
    int blk;

    for (blk = 0; blk < 2; blk++)
    {
        vp8_fast_fdct4x4_armv6(input + 4 * blk, output + 16 * blk, pitch);
    }
}
#endif
/* HAVE_ARMV6 */
vp8/encoder/arm/dct_arm.h
View file @
4cde2ab7
...
...
@@ -14,12 +14,21 @@
#if HAVE_ARMV6
extern
prototype_fdct
(
vp8_short_walsh4x4_armv6
);
extern
prototype_fdct
(
vp8_fast_fdct4x4_armv6
);
extern
prototype_fdct
(
vp8_fast_fdct8x4_armv6
);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
#endif
#endif
#endif
/* HAVE_ARMV6 */
#if HAVE_ARMV7
extern
prototype_fdct
(
vp8_short_fdct4x4_neon
);
...
...
vp8/vp8cx_arm.mk
View file @
4cde2ab7
...
...
@@ -19,6 +19,7 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/asm_enc_offsets.c
VP8_CX_SRCS-$(HAVE_ARMV7)
+=
encoder/arm/encodemb_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7)
+=
encoder/arm/quantize_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7)
+=
encoder/arm/picklpf_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/dct_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/variance_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/variance_arm.h
VP8_CX_SRCS-$(HAVE_ARMV5TE)
+=
encoder/arm/boolhuff_arm.c
...
...
@@ -34,6 +35,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar
#File list for armv6
# encoder
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_fast_fdct4x4_armv6
$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_fast_quantize_b_armv6
$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_sad16x16_armv6
$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6)
+=
encoder/arm/armv6/vp8_variance16x16_armv6
$(ASM)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment