Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
A
aom-rav1e
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Xiph.Org
aom-rav1e
Commits
86e07525
Commit
86e07525
authored
13 years ago
by
Johann Koenig
Committed by
Gerrit Code Review
13 years ago
Browse files
Options
Downloads
Plain Diff
Merge "NEON walsh transform updated to match C"
parents
3a16276c
2a4b2a00
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+69
-42
69 additions, 42 deletions
vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
with
69 additions
and
42 deletions
vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+
69
−
42
View file @
86e07525
...
...
@@ -16,58 +16,85 @@
PRESERVE8
AREA
||
.text
||
,
CODE
,
READONLY
,
AL
IGN
=
2
;void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
; r0 short *input,
; r1 short *output,
; r2 int pitch
|
vp8_short_walsh4x4_neon
|
PROC
vld1.16
{
d2
}
,
[
r0
],
r2
;load input
vld1.16
{
d3
}
,
[
r0
],
r2
vld1.16
{
d4
}
,
[
r0
],
r2
vld1.16
{
d5
}
,
[
r0
],
r2
;First for-loop
;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
vtrn.32
d2
,
d4
vtrn.32
d3
,
d5
vtrn.16
d2
,
d3
vtrn.16
d4
,
d5
vld1.16
{
d0
}
,
[
r0@64
],
r2
; load input
vld1.16
{
d1
}
,
[
r0@64
],
r2
vld1.16
{
d2
}
,
[
r0@64
],
r2
vld1.16
{
d3
}
,
[
r0@64
]
vadd.s16
d6
,
d2
,
d5
;a1 = ip[0]+ip[3]
vadd.s16
d
7
,
d
3
,
d
4
;b1 = ip[1]+ip[2
]
v
sub.s16
d8
,
d
3
,
d
4
;c1 = ip[1]-ip[2]
v
sub.s16
d9
,
d
2
,
d
5
;d1 = ip[0]-ip[3]
;First for-loop
;transpose
d
0
, d
1
, d
2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3
]
v
trn.32
d
0
,
d
2
v
trn.32
d
1
,
d
3
vadd.s16
d2
,
d6
,
d7
;op[0] = a1 + b1
vsub.s16
d4
,
d6
,
d7
;op[2] = a1 - b1
vadd.s16
d3
,
d8
,
d9
;op[1] = c1 + d1
vsub.s16
d5
,
d9
,
d8
;op[3] = d1 - c1
vmov.s32
q15
,
#
3
; add 3 to all values
;Second for-loop
;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
vtrn.32
d2
,
d4
vtrn.32
d3
,
d5
vtrn.16
d0
,
d1
vtrn.16
d2
,
d3
vtrn.16
d4
,
d5
vadd.s16
d
6
,
d
2
,
d
5
;a1 =
ip[0]
+
ip[
1
2]
vadd.s16
d
7
,
d
3
,
d
4
;
b1 = ip[4]+
ip[
8
]
vsub.s16
d
8
,
d
3
,
d
4
;
c1 = ip[4]-
ip[
8
]
vsub.s16
d
9
,
d
2
,
d
5
;d1 =
ip[0]
-
ip[
1
2]
vadd.s16
d
4
,
d
0
,
d
2
;
ip[0]
+
ip[2]
vadd.s16
d
5
,
d
1
,
d
3
;
ip[1] +
ip[
3
]
vsub.s16
d
6
,
d
1
,
d
3
;
ip[1] -
ip[
3
]
vsub.s16
d
7
,
d
0
,
d
2
;
ip[0]
-
ip[2]
vadd.s16
d2
,
d6
,
d7
;a2 = a1 + b1;
vsub.s16
d4
,
d6
,
d7
;c2 = a1 - b1;
vadd.s16
d3
,
d8
,
d9
;b2 = c1 + d1;
vsub.s16
d5
,
d9
,
d8
;d2 = d1 - c1;
vshl.s16
d4
,
d4
,
#
2
; a1 = (ip[0] + ip[2]) << 2
vshl.s16
d5
,
d5
,
#
2
; d1 = (ip[1] + ip[3]) << 2
vshl.s16
d6
,
d6
,
#
2
; c1 = (ip[1] - ip[3]) << 2
vceq.s16
d16
,
d4
,
#
0
; a1 == 0
vshl.s16
d7
,
d7
,
#
2
; b1 = (ip[0] - ip[2]) << 2
vcgt.s16
q3
,
q1
,
#
0
vcgt.s16
q4
,
q2
,
#
0
vadd.s16
d0
,
d4
,
d5
; a1 + d1
vmvn
d16
,
d16
; a1 != 0
vsub.s16
d3
,
d4
,
d5
; op[3] = a1 - d1
vadd.s16
d1
,
d7
,
d6
; op[1] = b1 + c1
vsub.s16
d2
,
d7
,
d6
; op[2] = b1 - c1
vsub.s16
d0
,
d0
,
d16
; op[0] = a1 + d1 + (a1 != 0)
vsub.s16
q1
,
q1
,
q3
vsub.s16
q2
,
q2
,
q4
vshr.s16
q1
,
q1
,
#
1
vshr.s16
q2
,
q2
,
#
1
vst1.16
{
q1
,
q2
}
,
[
r1
]
;Second for-loop
;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
vtrn.32
d1
,
d3
vtrn.32
d0
,
d2
vtrn.16
d2
,
d3
vtrn.16
d0
,
d1
vaddl.s16
q8
,
d0
,
d2
; a1 = ip[0]+ip[8]
vaddl.s16
q9
,
d1
,
d3
; d1 = ip[4]+ip[12]
vsubl.s16
q10
,
d1
,
d3
; c1 = ip[4]-ip[12]
vsubl.s16
q11
,
d0
,
d2
; b1 = ip[0]-ip[8]
vadd.s32
q0
,
q8
,
q9
; a2 = a1 + d1
vadd.s32
q1
,
q11
,
q10
; b2 = b1 + c1
vsub.s32
q2
,
q11
,
q10
; c2 = b1 - c1
vsub.s32
q3
,
q8
,
q9
; d2 = a1 - d1
vclt.s32
q8
,
q0
,
#
0
vclt.s32
q9
,
q1
,
#
0
vclt.s32
q10
,
q2
,
#
0
vclt.s32
q11
,
q3
,
#
0
; subtract -1 (or 0)
vsub.s32
q0
,
q0
,
q8
; a2 += a2 < 0
vsub.s32
q1
,
q1
,
q9
; b2 += b2 < 0
vsub.s32
q2
,
q2
,
q10
; c2 += c2 < 0
vsub.s32
q3
,
q3
,
q11
; d2 += d2 < 0
vadd.s32
q8
,
q0
,
q15
; a2 + 3
vadd.s32
q9
,
q1
,
q15
; b2 + 3
vadd.s32
q10
,
q2
,
q15
; c2 + 3
vadd.s32
q11
,
q3
,
q15
; d2 + 3
; vrshrn? would add 1 << 3-1 = 2
vshrn.s32
d0
,
q8
,
#
3
vshrn.s32
d1
,
q9
,
#
3
vshrn.s32
d2
,
q10
,
#
3
vshrn.s32
d3
,
q11
,
#
3
vst1.16
{
q0
,
q1
}
,
[
r1@128
]
bx
lr
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment