Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
A
aom-rav1e
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Xiph.Org
aom-rav1e
Commits
77119a5c
Commit
77119a5c
authored
13 years ago
by
Scott LaVarnway
Committed by
Gerrit Code Review
13 years ago
Browse files
Options
Downloads
Plain Diff
Merge "Improved sse2 version of simple loopfilter"
parents
5bfa29b6
1d7d18c6
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
vp8/common/x86/loopfilter_sse2.asm
+94
-105
94 additions, 105 deletions
vp8/common/x86/loopfilter_sse2.asm
with
94 additions
and
105 deletions
vp8/common/x86/loopfilter_sse2.asm
+
94
−
105
View file @
77119a5c
...
...
@@ -1385,52 +1385,54 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
SHADOW_ARGS_TO_STACK
3
SAVE_XMM
7
GET_GOT
rbx
push
rsi
push
rdi
; end prolog
mov
r
si
,
arg
(
0
)
;src_ptr
mov
r
cx
,
arg
(
0
)
;src_ptr
movsxd
rax
,
dword
ptr
arg
(
1
)
;src_pixel_step ; destination pitch?
mov
rdx
,
arg
(
2
)
;blimit
movdqa
xmm3
,
XMMWORD
PTR
[
rdx
]
mov
rdi
,
rsi
; rdi points to row +1 for indirect addressing
add
rdi
,
rax
lea
rdx
,
[
rcx
+
rax
]
neg
rax
; calculate mask
movdqa
xmm1
,
[
rsi
+
2
*
rax
]
; p1
movdqa
xmm0
,
[
rdi
]
; q1
movdqa
xmm0
,
[
rdx
]
; q1
mov
rdx
,
arg
(
2
)
;blimit
movdqa
xmm1
,
[
rcx
+
2
*
rax
]
; p1
movdqa
xmm2
,
xmm1
movdqa
xmm7
,
xmm0
movdqa
xmm4
,
xmm0
psubusb
xmm0
,
xmm1
; q1-=p1
psubusb
xmm1
,
xmm
4
; p1-=q1
psubusb
xmm1
,
xmm
7
; p1-=q1
por
xmm1
,
xmm0
; abs(p1-q1)
pand
xmm1
,
[
GLOBAL
(
tfe
)]
; set lsb of each byte to zero
psrlw
xmm1
,
1
; abs(p1-q1)/2
movdqa
xmm5
,
[
rsi
+
rax
]
; p0
movdqa
xmm4
,
[
rsi
]
; q0
movdqa
xmm3
,
XMMWORD
PTR
[
rdx
]
movdqa
xmm5
,
[
rcx
+
rax
]
; p0
movdqa
xmm4
,
[
rcx
]
; q0
movdqa
xmm0
,
xmm4
; q0
movdqa
xmm6
,
xmm5
; p0
psubusb
xmm5
,
xmm4
; p0-=q0
psubusb
xmm4
,
xmm6
; q0-=p0
por
xmm5
,
xmm4
; abs(p0 - q0)
movdqa
xmm4
,
[
GLOBAL
(
t80
)]
paddusb
xmm5
,
xmm5
; abs(p0-q0)*2
paddusb
xmm5
,
xmm1
; abs (p0 - q0) *2 + abs(p1-q1)/2
psubusb
xmm5
,
xmm3
; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor
xmm3
,
xmm3
pcmpeqb
xmm5
,
xmm3
; start work on filters
pxor
xmm2
,
[
GLOBAL
(
t80
)]
; p1 offset to convert to signed values
pxor
xmm7
,
[
GLOBAL
(
t80
)]
; q1 offset to convert to signed values
pxor
xmm2
,
xmm4
; p1 offset to convert to signed values
pxor
xmm7
,
xmm4
; q1 offset to convert to signed values
psubsb
xmm2
,
xmm7
; p1 - q1
pxor
xmm6
,
[
GLOBAL
(
t80
)]
; offset to convert to signed values
pxor
xmm0
,
[
GLOBAL
(
t80
)]
; offset to convert to signed values
pxor
xmm6
,
xmm4
; offset to convert to signed values
pxor
xmm0
,
xmm4
; offset to convert to signed values
movdqa
xmm3
,
xmm0
; q0
psubsb
xmm0
,
xmm6
; q0 - p0
paddsb
xmm2
,
xmm0
; p1 - q1 + 1 * (q0 - p0)
...
...
@@ -1438,42 +1440,36 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
paddsb
xmm2
,
xmm0
; p1 - q1 + 3 * (q0 - p0)
pand
xmm5
,
xmm2
; mask filter values we don't care about
; do + 4 side
paddsb
xmm5
,
[
GLOBAL
(
t4
)]
; 3* (q0 - p0) + (p1 - q1) + 4
movdqa
xmm0
,
xmm5
; get a copy of filters
psllw
xmm0
,
8
; shift left 8
psraw
xmm0
,
3
; arithmetic shift right 11
psrlw
xmm0
,
8
movdqa
xmm1
,
xmm5
; get a copy of filters
psraw
xmm1
,
11
; arithmetic shift right 11
psllw
xmm1
,
8
; shift left 8 to put it back
por
xmm0
,
xmm1
; put the two together to get result
paddsb
xmm5
,
[
GLOBAL
(
t4
)]
; 3* (q0 - p0) + (p1 - q1) + 4
movdqa
xmm0
,
xmm5
psubsb
xmm5
,
[
GLOBAL
(
t1s
)]
; +3 instead of +4
psubsb
xmm3
,
xmm0
; q0-= q0 add
pxor
xmm3
,
[
GLOBAL
(
t80
)]
; unoffset
movdqa
[
rsi
],
xmm3
; write back
movdqa
xmm1
,
[
GLOBAL
(
te0
)]
movdqa
xmm2
,
[
GLOBAL
(
t1f
)]
; now do +3 side
psubsb
xmm5
,
[
GLOBAL
(
t1s
)]
; +3 instead of +4
pxor
xmm7
,
xmm7
pcmpgtb
xmm7
,
xmm0
;save sign
pand
xmm7
,
xmm1
;preserve the upper 3 bits
psrlw
xmm0
,
3
pand
xmm0
,
xmm2
;clear out upper 3 bits
por
xmm0
,
xmm7
;add sign
psubsb
xmm3
,
xmm0
; q0-= q0sz add
movdqa
xmm
0
,
xmm
5
; get a copy of filters
p
sllw
xmm
0
,
8
;
shift left 8
p
sraw
xmm
0
,
3
;
arithmetic shift right 11
psrlw
xmm
0
,
8
p
sraw
xmm5
,
11
; arithmetic shift right 11
p
sllw
xmm5
,
8
;
shift left 8 to put it back
p
or
xmm
0
,
xmm5
; p
ut the two together to get result
pxor
xmm
7
,
xmm
7
p
cmpgtb
xmm
7
,
xmm5
;
save sign
p
and
xmm
7
,
xmm1
;
preserve the upper 3 bits
psrlw
xmm
5
,
3
p
and
xmm5
,
xmm2
;clear out upper 3 bits
p
or
xmm5
,
xmm7
;
add sign
p
addsb
xmm
6
,
xmm5
; p
0+= p0 add
pxor
xmm3
,
xmm4
; unoffset
movdqa
[
rcx
],
xmm3
; write back
paddsb
xmm6
,
xmm0
; p0+= p0 add
pxor
xmm6
,
[
GLOBAL
(
t80
)]
; unoffset
movdqa
[
rsi
+
rax
],
xmm6
; write back
pxor
xmm6
,
xmm4
; unoffset
movdqa
[
rcx
+
rax
],
xmm6
; write back
; begin epilog
pop
rdi
pop
rsi
REST
ORE_GOT
REST
ORE_XMM
UNSHADOW_ARGS
...
...
@@ -1536,9 +1532,6 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpckldq
xmm0
,
xmm1
; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
punpckhdq
xmm2
,
xmm1
; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
movdqa
t0
,
xmm0
; save to t0
movdqa
t1
,
xmm2
; save to t1
lea
rsi
,
[
rsi
+
rax
*
8
]
lea
rdi
,
[
rsi
+
rax
]
lea
rdx
,
[
rsi
+
rax
*
4
]
...
...
@@ -1551,26 +1544,24 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpckldq
xmm4
,
xmm1
; c3 c2 c1 c0 83 82 81 80
punpckldq
xmm6
,
xmm3
; d3 d2 d1 d0 93 92 91 90
movd
xmm
0
,
[
rsi
+
rax
*
2
]
; a3 a2 a1 a0
movd
xmm
1
,
[
rsi
+
rax
*
2
]
; a3 a2 a1 a0
movd
xmm5
,
[
rdx
+
rax
*
2
]
; e3 e2 e1 e0
movd
xmm
2
,
[
rdi
+
rax
*
2
]
; b3 b2 b1 b0
movd
xmm
3
,
[
rdi
+
rax
*
2
]
; b3 b2 b1 b0
movd
xmm7
,
[
rcx
+
rax
*
2
]
; f3 f2 f1 f0
punpckldq
xmm
0
,
xmm5
; e3 e2 e1 e0 a3 a2 a1 a0
punpckldq
xmm
2
,
xmm7
; f3 f2 f1 f0 b3 b2 b1 b0
punpckldq
xmm
1
,
xmm5
; e3 e2 e1 e0 a3 a2 a1 a0
punpckldq
xmm
3
,
xmm7
; f3 f2 f1 f0 b3 b2 b1 b0
punpcklbw
xmm4
,
xmm6
; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
punpcklbw
xmm
0
,
xmm
2
; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
punpcklbw
xmm
1
,
xmm
3
; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
movdqa
xmm
1
,
xmm4
punpcklwd
xmm4
,
xmm
0
; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
punpckhwd
xmm
1
,
xmm
0
; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
movdqa
xmm
7
,
xmm4
punpcklwd
xmm4
,
xmm
1
; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
punpckhwd
xmm
7
,
xmm
1
; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
movdqa
xmm6
,
xmm4
punpckldq
xmm4
,
xmm
1
; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
punpckhdq
xmm6
,
xmm
1
; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
punpckldq
xmm4
,
xmm
7
; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
punpckhdq
xmm6
,
xmm
7
; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
movdqa
xmm0
,
t0
; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
movdqa
xmm2
,
t1
; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
movdqa
xmm1
,
xmm0
movdqa
xmm3
,
xmm2
...
...
@@ -1579,6 +1570,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpcklqdq
xmm2
,
xmm6
; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
punpckhqdq
xmm3
,
xmm6
; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
mov
rdx
,
arg
(
2
)
;blimit
; calculate mask
movdqa
xmm6
,
xmm0
; p1
movdqa
xmm7
,
xmm3
; q1
...
...
@@ -1588,6 +1581,8 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
pand
xmm6
,
[
GLOBAL
(
tfe
)]
; set lsb of each byte to zero
psrlw
xmm6
,
1
; abs(p1-q1)/2
movdqa
xmm7
,
[
rdx
]
movdqa
xmm5
,
xmm1
; p0
movdqa
xmm4
,
xmm2
; q0
psubusb
xmm5
,
xmm2
; p0-=q0
...
...
@@ -1596,8 +1591,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
paddusb
xmm5
,
xmm5
; abs(p0-q0)*2
paddusb
xmm5
,
xmm6
; abs (p0 - q0) *2 + abs(p1-q1)/2
mov
rdx
,
arg
(
2
)
;blimit
movdqa
xmm7
,
XMMWORD
PTR
[
rdx
]
movdqa
xmm4
,
[
GLOBAL
(
t80
)]
psubusb
xmm5
,
xmm7
; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor
xmm7
,
xmm7
...
...
@@ -1607,59 +1601,48 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
movdqa
t0
,
xmm0
movdqa
t1
,
xmm3
pxor
xmm0
,
[
GLOBAL
(
t80
)]
; p1 offset to convert to signed values
pxor
xmm3
,
[
GLOBAL
(
t80
)]
; q1 offset to convert to signed values
pxor
xmm0
,
xmm4
; p1 offset to convert to signed values
pxor
xmm3
,
xmm4
; q1 offset to convert to signed values
psubsb
xmm0
,
xmm3
; p1 - q1
movdqa
xmm6
,
xmm1
; p0
movdqa
xmm7
,
xmm2
; q0
pxor
xmm6
,
[
GLOBAL
(
t80
)]
; offset to convert to signed values
pxor
xmm7
,
[
GLOBAL
(
t80
)]
; offset to convert to signed values
movdqa
xmm3
,
xmm7
; offseted ; q0
psubsb
xmm7
,
xmm6
; q0 - p0
paddsb
xmm0
,
xmm7
; p1 - q1 + 1 * (q0 - p0)
movdqa
xmm6
,
xmm1
; p0
; movdqa xmm7, xmm2 ; q0
p
addsb
xmm
0
,
xmm
7
; p1 - q1 + 2 * (q0 - p0)
p
addsb
xmm
0
,
xmm
7
; p1 - q1 + 3 * (q0 - p0)
p
xor
xmm
6
,
xmm
4
; offset to convert to signed values
p
xor
xmm
2
,
xmm
4
; offset to convert to signed values
movdqa
xmm3
,
xmm2
; offseted ; q0
psubsb
xmm2
,
xmm6
; q0 - p0
paddsb
xmm0
,
xmm2
; p1 - q1 + 1 * (q0 - p0)
paddsb
xmm0
,
xmm2
; p1 - q1 + 2 * (q0 - p0)
paddsb
xmm0
,
xmm2
; p1 - q1 + 3 * (q0 - p0)
pand
xmm5
,
xmm0
; mask filter values we don't care about
paddsb
xmm5
,
[
GLOBAL
(
t4
)]
; 3* (q0 - p0) + (p1 - q1) + 4
movdqa
xmm0
,
xmm5
; get a copy of filters
psllw
xmm0
,
8
; shift left 8
psraw
xmm0
,
3
; arithmetic shift right 11
psrlw
xmm0
,
8
movdqa
xmm7
,
xmm5
; get a copy of filters
psraw
xmm7
,
11
; arithmetic shift right 11
psllw
xmm7
,
8
; shift left 8 to put it back
por
xmm0
,
xmm7
; put the two together to get result
psubsb
xmm3
,
xmm0
; q0-= q0sz add
pxor
xmm3
,
[
GLOBAL
(
t80
)]
; unoffset q0
; now do +3 side
movdqa
xmm0
,
xmm5
psubsb
xmm5
,
[
GLOBAL
(
t1s
)]
; +3 instead of +4
movdqa
xmm0
,
xmm5
; get a copy of filters
psllw
xmm
0
,
8
; shift left 8
psraw
xmm
0
,
3
; arithmetic shift right 11
movdqa
xmm
1
,
[
GLOBAL
(
te0
)]
movdqa
xmm
2
,
[
GLOBAL
(
t1f
)]
psrlw
xmm0
,
8
psraw
xmm5
,
11
; arithmetic shift right 11
pxor
xmm7
,
xmm7
pcmpgtb
xmm7
,
xmm0
;save sign
pand
xmm7
,
xmm1
;preserve the upper 3 bits
psrlw
xmm0
,
3
pand
xmm0
,
xmm2
;clear out upper 3 bits
por
xmm0
,
xmm7
;add sign
psubsb
xmm3
,
xmm0
; q0-= q0sz add
psllw
xmm5
,
8
; shift left 8 to put it back
por
xmm0
,
xmm5
; put the two together to get result
pxor
xmm7
,
xmm7
pcmpgtb
xmm7
,
xmm5
;save sign
pand
xmm7
,
xmm1
;preserve the upper 3 bits
psrlw
xmm5
,
3
pand
xmm5
,
xmm2
;clear out upper 3 bits
por
xmm5
,
xmm7
;add sign
paddsb
xmm6
,
xmm5
; p0+= p0 add
p
addsb
xmm
6
,
xmm
0
; p0+= p0 add
pxor
xmm6
,
[
GLOBAL
(
t80
)]
; unoffset p0
p
xor
xmm
3
,
xmm
4
; unoffset q0
pxor
xmm6
,
xmm4
; unoffset p0
movdqa
xmm0
,
t0
; p1
movdqa
xmm4
,
t1
; q1
...
...
@@ -1763,3 +1746,9 @@ s9:
align
16
s63:
times
8
dw
0x003f
align
16
te0:
times
16
db
0xe0
align
16
t1f:
times
16
db
0x1f
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment