Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
a522be29
Commit
a522be29
authored
Aug 19, 2010
by
Johann
Committed by
Code Review
Aug 19, 2010
Browse files
Merge "fix armv6 simpleloop filter"
parents
6ea5bb85
467a0b99
Changes
1
Hide whitespace changes
Inline
Side-by-side
vp8/common/arm/armv6/simpleloopfilter_v6.asm
View file @
a522be29
...
...
@@ -63,23 +63,22 @@ pstep RN r1
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb
sp
!
,
{
r4
-
r11
,
lr
}
sub
src
,
src
,
pstep
,
lsl
#
1
; move src pointer down by 2 lines
ldr
r12
,
[
r3
]
; limit
ldr
r3
,
[
src
,
-
pstep
,
lsl
#
1
]
; p1
ldr
r
12
,
[
r3
]
,
#
4
;
limit
ldr
r
3
,
[
src
]
,
pstep
; p
1
ldr
r
9
,
[
sp
,
#
4
0
]
;
count for 8-in-parallel
ldr
r
4
,
[
src
,
-
pstep
]
; p
0
ldr
r9
,
[
sp
,
#
36
]
; count for 8-in-parallel
ldr
r4
,
[
src
],
pstep
; p0
ldr
r7
,
[
r2
],
#
4
; flimit
ldr
r5
,
[
src
],
pstep
; q0
ldr
r7
,
[
r2
]
; flimit
ldr
r5
,
[
src
]
; q0
ldr
r2
,
c0x80808080
ldr
r6
,
[
src
]
; q1
ldr
r6
,
[
src
,
pstep
]
; q1
uadd8
r7
,
r7
,
r7
; flimit * 2
mov
r9
,
r9
,
lsl
#
1
;
4-in-parallel
mov
r9
,
r9
,
lsl
#
1
;
double the count. we're doing 4 at a time
uadd8
r12
,
r7
,
r12
; flimit * 2 + limit
mov
lr
,
#
0
|
simple_hnext8
|
; vp8_simple_filter_mask() function
...
...
@@ -89,22 +88,19 @@ pstep RN r1
uqsub8
r10
,
r4
,
r5
; p0 - q0
uqsub8
r11
,
r5
,
r4
; q0 - p0
orr
r8
,
r8
,
r7
; abs(p1 - q1)
ldr
lr
,
c0x7F7F7F7F
; 01111111 mask
orr
r10
,
r10
,
r11
; abs(p0 - q0)
and
r8
,
lr
,
r8
,
l
s
r
#
1
; abs(p1 - q
1
)
/ 2
uhadd8
r8
,
r8
,
lr
; abs(p1 - q
1
)
>> 1
uqadd8
r10
,
r10
,
r10
; abs(p0 - q0) * 2
mvn
lr
,
#
0
; lr == -1
; STALL waiting on r10
uqadd8
r10
,
r10
,
r8
; abs(p0 - q0)*2 + abs(p1 - q1)/2
; STALL waiting on r10 :(
uqsub8
r10
,
r10
,
r12
; compare to flimit
mov
r8
,
#
0
usub8
r10
,
r8
,
r10
; use usub8 instead of ssub8
; STALL (maybe?) when are flags set? :/
sel
r10
,
lr
,
r8
; filter mask: lr
; STALL waiting on r10
mvn
r8
,
#
0
uqsub8
r10
,
r10
,
r12
; compare to flimit. need to do this twice because uqsub8 doesn't set GE flags
; and usub8 doesn't saturate
usub8
r10
,
lr
,
r10
; set GE flags for each byte
sel
r10
,
r8
,
lr
; filter mask: F or 0
cmp
r10
,
#
0
beq
si
mple_hskip_filter
; skip filtering
beq
si
mple_hskip_filter
; skip filtering
if we're &ing with 0s. would just write out the same values
;vp8_simple_filter() function
...
...
@@ -113,55 +109,45 @@ pstep RN r1
eor
r4
,
r4
,
r2
; p0 offset to convert to a signed value
eor
r5
,
r5
,
r2
; q0 offset to convert to a signed value
qsub8
r3
,
r3
,
r6
; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
qsub8
r6
,
r5
,
r4
; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
qsub8
r3
,
r3
,
r6
; vp8_signed_char_clamp(p1-q1)
qsub8
r6
,
r5
,
r4
; vp8_signed_char_clamp(q0-p0)
qadd8
r3
,
r3
,
r6
; += q0-p0
qadd8
r3
,
r3
,
r6
; += q0-p0
qadd8
r3
,
r3
,
r6
; p1-q1 + 3*(q0-p0))
and
r3
,
r3
,
r10
; &= mask
qadd8
r3
,
r3
,
r6
ldr
r8
,
c0x03030303
; r8 = 3
qadd8
r3
,
r3
,
r6
ldr
r7
,
c0x04040404
qadd8
r3
,
r3
,
r6
and
r3
,
r3
,
lr
; vp8_filter &= mask;
ldr
r8
,
c0x03030303
;save bottom 3 bits so that we round one side +4 and the other +3
qadd8
r7
,
r3
,
r7
; Filter1 (r7) = vp8_signed_char_clamp(vp8_filter+4)
qadd8
r8
,
r3
,
r8
; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
qadd8
r3
,
r3
,
r7
; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
mov
r
7
,
#
0
shadd8
r
8
,
r
8
,
r
7
; Filter2 >>=
3
shadd8
r
3
,
r
3
,
r
7
; Filter1 >>=
3
shadd8
r
8
,
r
8
,
r
7
shadd8
r
3
,
r
3
,
r
7
shadd8
r
8
,
r
8
,
r
7
;
r8:
Filter
2
shadd8
r
3
,
r
3
,
r
7
;
r7: filter1
mov
r
3
,
#
0
shadd8
r
7
,
r
7
,
r3
shadd8
r
8
,
r
8
,
r3
shadd8
r
7
,
r
7
,
r
3
shadd8
r
8
,
r
8
,
r
3
shadd8
r
7
,
r
7
,
r
3
; Filter
1 >>= 3
shadd8
r
8
,
r
8
,
r
3
;
Filter2 >>= 3
;calculate output
sub
src
,
src
,
pstep
,
lsl
#
1
qsub8
r5
,
r5
,
r7
; u = vp8_signed_char_clamp(q0 - Filter1)
qadd8
r4
,
r4
,
r8
; u = vp8_signed_char_clamp(p0 + Filter2)
qsub8
r5
,
r5
,
r3
; u = vp8_signed_char_clamp(q0 - Filter1)
eor
r4
,
r4
,
r2
; *op0 = u^0x80
str
r4
,
[
src
],
pstep
; store op0 result
eor
r5
,
r5
,
r2
; *oq0 = u^0x80
str
r5
,
[
src
],
pstep
; store oq0 result
str
r5
,
[
src
]
; store oq0 result
eor
r4
,
r4
,
r2
; *op0 = u^0x80
str
r4
,
[
src
,
-
pstep
]
; store op0 result
|
simple_hskip_filter
|
add
src
,
src
,
#
4
sub
src
,
src
,
pstep
sub
src
,
src
,
pstep
,
lsl
#
1
subs
r9
,
r9
,
#
1
addne
src
,
src
,
#
4
; next row
;pld [src]
;pld [src, pstep]
;pld [src, pstep, lsl #1]
ldrne
r3
,
[
src
],
pstep
; p1
ldrne
r4
,
[
src
],
pstep
; p0
ldrne
r5
,
[
src
],
pstep
; q0
ldrne
r6
,
[
src
]
; q1
ldrne
r3
,
[
src
,
-
pstep
,
lsl
#
1
]
; p1
ldrne
r4
,
[
src
,
-
pstep
]
; p0
ldrne
r5
,
[
src
]
; q0
ldrne
r6
,
[
src
,
pstep
]
; q1
bne
si
mple_hnext8
...
...
@@ -174,9 +160,9 @@ pstep RN r1
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb
sp
!
,
{
r4
-
r11
,
lr
}
ldr
r12
,
[
r2
]
,
#
4
; r12: flimit
ldr
r12
,
[
r2
]
; r12: flimit
ldr
r2
,
c0x80808080
ldr
r7
,
[
r3
]
,
#
4
; limit
ldr
r7
,
[
r3
]
; limit
; load source data to r7, r8, r9, r10
ldrh
r3
,
[
src
,
#
-
2
]
...
...
@@ -213,16 +199,15 @@ pstep RN r1
uqsub8
r10
,
r5
,
r4
; q0 - p0
orr
r7
,
r7
,
r8
; abs(p1 - q1)
orr
r9
,
r9
,
r10
; abs(p0 - q0)
ldr
lr
,
c0x7F7F7F7F
; 0111 1111 mask
uqadd8
r9
,
r9
,
r9
; abs(p0 - q0) * 2
and
r7
,
lr
,
r7
,
lsr
#
1
; abs(p1 - q1) / 2
mov
r8
,
#
0
uqadd8
r9
,
r9
,
r9
; abs(p0 - q0) * 2
uhadd8
r7
,
r7
,
r8
; abs(p1 - q1) / 2
uqadd8
r7
,
r7
,
r9
; abs(p0 - q0)*2 + abs(p1 - q1)/2
mvn
r10
,
#
0
; r10 == -1
uqsub8
r7
,
r7
,
r12
; compare to flimit
usub8
r7
,
r8
,
r7
sel
r
7
,
r10
,
r8
; filter mask
: lr
sel
l
r
,
r10
,
r8
; filter mask
cmp
lr
,
#
0
beq
si
mple_vskip_filter
; skip filtering
...
...
@@ -286,10 +271,6 @@ pstep RN r1
|
simple_vskip_filter
|
subs
r11
,
r11
,
#
1
;pld [src]
;pld [src, pstep]
;pld [src, pstep, lsl #1]
; load source data to r7, r8, r9, r10
ldrneh
r3
,
[
src
,
#
-
2
]
ldrneh
r4
,
[
src
],
pstep
...
...
@@ -316,7 +297,5 @@ pstep RN r1
c0x80808080
DCD
0x80808080
c0x03030303
DCD
0x03030303
c0x04040404
DCD
0x04040404
c0x01010101
DCD
0x01010101
c0x7F7F7F7F
DCD
0x7F7F7F7F
END
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment