Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
10
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Xiph.Org
aom-rav1e
Commits
afa1b661
Commit
afa1b661
authored
Dec 12, 2011
by
Scott LaVarnway
Committed by
Gerrit Code Review
Dec 12, 2011
Browse files
Options
Browse Files
Download
Plain Diff
Merge "Improved mmx/sse2 versions of iwalsh"
parents
a69810b8
9fa6132f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
169 additions
and
246 deletions
+169
-246
vp8/common/x86/iwalsh_mmx.asm
vp8/common/x86/iwalsh_mmx.asm
+94
-131
vp8/common/x86/iwalsh_sse2.asm
vp8/common/x86/iwalsh_sse2.asm
+75
-115
No files found.
vp8/common/x86/iwalsh_mmx.asm
View file @
afa1b661
...
@@ -17,160 +17,123 @@ sym(vp8_short_inv_walsh4x4_mmx):
...
@@ -17,160 +17,123 @@ sym(vp8_short_inv_walsh4x4_mmx):
push
rbp
push
rbp
mov
rbp
,
rsp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
2
SHADOW_ARGS_TO_STACK
2
push
rsi
push
rdi
; end prolog
; end prolog
mov
rax
,
3
mov
rdx
,
arg
(
0
)
mov
rsi
,
arg
(
0
)
mov
rax
,
30003h
mov
rdi
,
arg
(
1
)
shl
rax
,
16
movq
mm0
,
[
r
si
+
0
]
;ip[0]
movq
mm0
,
[
r
dx
+
0
]
;ip[0]
movq
mm1
,
[
r
si
+
8
]
;ip[4]
movq
mm1
,
[
r
dx
+
8
]
;ip[4]
or
rax
,
3
;00030003h
movd
mm7
,
rax
movq
mm2
,
[
rsi
+
16
]
;ip[8]
movq
mm2
,
[
rdx
+
16
]
;ip[8]
movq
mm3
,
[
rsi
+
24
]
;ip[12]
movq
mm3
,
[
rdx
+
24
]
;ip[12]
punpcklwd
mm7
,
mm7
;0003000300030003h
mov
rdx
,
arg
(
1
)
movq
mm
7
,
rax
movq
mm
4
,
mm0
movq
mm
4
,
mm
0
movq
mm
5
,
mm
1
p
unpcklwd
mm
7
,
mm
7
;
0003000300030003h
p
addw
mm
4
,
mm
3
;
ip[0] + ip[12] aka al
movq
mm5
,
mm
1
paddw
mm5
,
mm
2
;ip[4] + ip[8] aka bl
paddw
mm4
,
mm3
;ip[0] + ip[12] aka al
movq
mm6
,
mm4
;temp al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
paddw
mm4
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
movq
mm6
,
mm4
;temp al
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm1
,
mm2
;ip[4] - ip[8] aka c1
paddw
mm4
,
mm5
;al + bl
movq
mm5
,
mm0
;temp dl
psubw
mm6
,
mm5
;al - bl
paddw
mm0
,
mm1
;dl + cl
psubw
mm5
,
mm1
;dl - cl
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm1
,
mm2
;ip[4] - ip[8] aka c1
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm1
;dl + cl
psubw
mm5
,
mm1
;dl - cl
; 03 02 01 00
; 03 02 01 00
; 13 12 11 10
; 13 12 11 10
; 23 22 21 20
; 23 22 21 20
; 33 32 31 30
; 33 32 31 30
movq
mm3
,
mm4
; 03 02 01 00
movq
mm3
,
mm4
; 03 02 01 00
punpcklwd
mm4
,
mm0
; 11 01 10 00
punpcklwd
mm4
,
mm0
; 11 01 10 00
punpckhwd
mm3
,
mm0
; 13 03 12 02
punpckhwd
mm3
,
mm0
; 13 03 12 02
movq
mm1
,
mm6
; 23 22 21 20
movq
mm1
,
mm6
; 23 22 21 20
punpcklwd
mm6
,
mm5
; 31 21 30 20
punpcklwd
mm6
,
mm5
; 31 21 30 20
punpckhwd
mm1
,
mm5
; 33 23 32 22
punpckhwd
mm1
,
mm5
; 33 23 32 22
movq
mm0
,
mm4
; 11 01 10 00
movq
mm0
,
mm4
; 11 01 10 00
movq
mm2
,
mm3
; 13 03 12 02
movq
mm2
,
mm3
; 13 03 12 02
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
punpckhdq
mm4
,
mm6
; 31 21 11 01 aka ip[4]
punpckhdq
mm4
,
mm6
; 31 21 11 01 aka ip[4]
punpckldq
mm2
,
mm1
; 32 22 12 02 aka ip[8]
punpckldq
mm2
,
mm1
; 32 22 12 02 aka ip[8]
punpckhdq
mm3
,
mm1
; 33 23 13 03 aka ip[12]
punpckhdq
mm3
,
mm1
; 33 23 13 03 aka ip[12]
;~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~
movq
mm1
,
mm0
movq
mm1
,
mm0
movq
mm5
,
mm4
movq
mm5
,
mm4
paddw
mm1
,
mm3
;ip[0] + ip[12] aka al
paddw
mm1
,
mm3
;ip[0] + ip[12] aka al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
movq
mm6
,
mm1
;temp al
movq
mm6
,
mm1
;temp al
paddw
mm1
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
paddw
mm1
,
mm5
;al + bl
paddw
mm1
,
mm7
psubw
mm6
,
mm5
;al - bl
paddw
mm6
,
mm7
psraw
mm1
,
3
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psraw
mm6
,
3
psubw
mm4
,
mm2
;ip[4] - ip[8] aka c1
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
movq
mm5
,
mm0
;temp dl
psubw
mm4
,
mm2
;ip[4] - ip[8] aka c1
paddw
mm0
,
mm4
;dl + cl
movq
mm5
,
mm0
;temp dl
psubw
mm5
,
mm4
;dl - cl
paddw
mm0
,
mm4
;dl + cl
psubw
mm5
,
mm4
;dl - cl
paddw
mm0
,
mm7
paddw
mm5
,
mm7
psraw
mm0
,
3
psraw
mm5
,
3
;~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~
movq
mm3
,
mm1
; 03 02 01 00
punpcklwd
mm1
,
mm0
; 11 01 10 00
movd
eax
,
mm1
punpckhwd
mm3
,
mm0
; 13 03 12 02
movd
ecx
,
mm0
psrlq
mm0
,
32
movq
mm4
,
mm6
; 23 22 21 20
psrlq
mm1
,
32
punpcklwd
mm6
,
mm5
; 31 21 30 20
mov
word
ptr
[
rdx
+
32
*
0
],
ax
punpckhwd
mm4
,
mm5
; 33 23 32 22
mov
word
ptr
[
rdx
+
32
*
1
],
cx
shr
eax
,
16
movq
mm0
,
mm1
; 11 01 10 00
shr
ecx
,
16
movq
mm2
,
mm3
; 13 03 12 02
mov
word
ptr
[
rdx
+
32
*
4
],
ax
mov
word
ptr
[
rdx
+
32
*
5
],
cx
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
movd
eax
,
mm1
punpckhdq
mm1
,
mm6
; 31 21 11 01 aka ip[4]
movd
ecx
,
mm0
mov
word
ptr
[
rdx
+
32
*
8
],
ax
punpckldq
mm2
,
mm4
; 32 22 12 02 aka ip[8]
mov
word
ptr
[
rdx
+
32
*
9
],
cx
punpckhdq
mm3
,
mm4
; 33 23 13 03 aka ip[12]
shr
eax
,
16
shr
ecx
,
16
paddw
mm0
,
mm7
mov
word
ptr
[
rdx
+
32
*
12
],
ax
paddw
mm1
,
mm7
mov
word
ptr
[
rdx
+
32
*
13
],
cx
paddw
mm2
,
mm7
paddw
mm3
,
mm7
movd
eax
,
mm6
movd
ecx
,
mm5
psraw
mm0
,
3
psrlq
mm5
,
32
psraw
mm1
,
3
psrlq
mm6
,
32
psraw
mm2
,
3
mov
word
ptr
[
rdx
+
32
*
2
],
ax
psraw
mm3
,
3
mov
word
ptr
[
rdx
+
32
*
3
],
cx
shr
eax
,
16
; movq [rdi + 0], mm0
shr
ecx
,
16
; movq [rdi + 8], mm1
mov
word
ptr
[
rdx
+
32
*
6
],
ax
; movq [rdi + 16], mm2
mov
word
ptr
[
rdx
+
32
*
7
],
cx
; movq [rdi + 24], mm3
movd
eax
,
mm6
movd
ecx
,
mm5
movd
eax
,
mm0
mov
word
ptr
[
rdx
+
32
*
10
],
ax
psrlq
mm0
,
32
mov
word
ptr
[
rdx
+
32
*
11
],
cx
mov
word
ptr
[
rdi
+
32
*
0
],
ax
shr
eax
,
16
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
1
],
ax
mov
word
ptr
[
rdx
+
32
*
14
],
ax
movd
eax
,
mm0
mov
word
ptr
[
rdx
+
32
*
15
],
cx
mov
word
ptr
[
rdi
+
32
*
2
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
3
],
ax
movd
ecx
,
mm1
psrlq
mm1
,
32
mov
word
ptr
[
rdi
+
32
*
4
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
5
],
cx
movd
ecx
,
mm1
mov
word
ptr
[
rdi
+
32
*
6
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
7
],
cx
movd
eax
,
mm2
psrlq
mm2
,
32
mov
word
ptr
[
rdi
+
32
*
8
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
9
],
ax
movd
eax
,
mm2
mov
word
ptr
[
rdi
+
32
*
10
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
11
],
ax
movd
ecx
,
mm3
psrlq
mm3
,
32
mov
word
ptr
[
rdi
+
32
*
12
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
13
],
cx
movd
ecx
,
mm3
mov
word
ptr
[
rdi
+
32
*
14
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
15
],
cx
; begin epilog
; begin epilog
pop
rdi
pop
rsi
UNSHADOW_ARGS
UNSHADOW_ARGS
pop
rbp
pop
rbp
ret
ret
...
...
vp8/common/x86/iwalsh_sse2.asm
View file @
afa1b661
...
@@ -17,145 +17,105 @@ sym(vp8_short_inv_walsh4x4_sse2):
...
@@ -17,145 +17,105 @@ sym(vp8_short_inv_walsh4x4_sse2):
push
rbp
push
rbp
mov
rbp
,
rsp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
2
SHADOW_ARGS_TO_STACK
2
SAVE_XMM
6
push
rsi
push
rdi
; end prolog
; end prolog
mov
rsi
,
arg
(
0
)
mov
rcx
,
arg
(
0
)
mov
rd
i
,
arg
(
1
)
mov
rd
x
,
arg
(
1
)
mov
rax
,
3
mov
rax
,
3
0003h
movdqa
xmm0
,
[
r
si
+
0
]
;ip[4] ip[0]
movdqa
xmm0
,
[
r
cx
+
0
]
;ip[4] ip[0]
movdqa
xmm1
,
[
r
si
+
16
]
;ip[12] ip[8]
movdqa
xmm1
,
[
r
cx
+
16
]
;ip[12] ip[8]
shl
rax
,
16
or
rax
,
3
;00030003h
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm0
;ip[4] ip[0]
movdqa
xmm3
,
xmm0
;ip[4] ip[0]
paddw
xmm0
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
paddw
xmm0
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa
xmm4
,
xmm0
movdqa
xmm4
,
xmm0
punpcklqdq
xmm0
,
xmm3
;d1 a1
punpcklqdq
xmm0
,
xmm3
;d1 a1
punpckhqdq
xmm4
,
xmm3
;c1 b1
punpckhqdq
xmm4
,
xmm3
;c1 b1
movd
xmm6
,
eax
movdqa
xmm1
,
xmm4
;c1 b1
movdqa
xmm1
,
xmm4
;c1 b1
paddw
xmm4
,
xmm0
;dl+cl a1+b1 aka op[4] op[0]
paddw
xmm4
,
xmm0
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm0
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
psubw
xmm0
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
;;;temp output
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;; movdqu [rdi + 0], xmm4
;; movdqu [rdi + 16], xmm3
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
; 13 12 11 10 03 02 01 00
;
;
; 33 32 31 30 23 22 21 20
; 33 32 31 30 23 22 21 20
;
;
movdqa
xmm3
,
xmm4
; 13 12 11 10 03 02 01 00
movdqa
xmm3
,
xmm4
; 13 12 11 10 03 02 01 00
punpcklwd
xmm4
,
xmm0
; 23 03 22 02 21 01 20 00
punpcklwd
xmm4
,
xmm0
; 23 03 22 02 21 01 20 00
punpckhwd
xmm3
,
xmm0
; 33 13 32 12 31 11 30 10
punpckhwd
xmm3
,
xmm0
; 33 13 32 12 31 11 30 10
movdqa
xmm1
,
xmm4
; 23 03 22 02 21 01 20 00
movdqa
xmm1
,
xmm4
; 23 03 22 02 21 01 20 00
punpcklwd
xmm4
,
xmm3
; 31 21 11 01 30 20 10 00
punpcklwd
xmm4
,
xmm3
; 31 21 11 01 30 20 10 00
punpckhwd
xmm1
,
xmm3
; 33 23 13 03 32 22 12 02
punpckhwd
xmm1
,
xmm3
; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movd
xmm0
,
eax
movdqa
xmm3
,
xmm4
;ip[4] ip[0]
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm4
;ip[4] ip[0]
pshufd
xmm
6
,
xmm
6
,
0
;03 03 03 03 03 03 03 03
pshufd
xmm
0
,
xmm
0
,
0
;03 03 03 03 03 03 03 03
paddw
xmm4
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
paddw
xmm4
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa
xmm5
,
xmm4
movdqa
xmm5
,
xmm4
punpcklqdq
xmm4
,
xmm3
;d1 a1
punpcklqdq
xmm4
,
xmm3
;d1 a1
punpckhqdq
xmm5
,
xmm3
;c1 b1
punpckhqdq
xmm5
,
xmm3
;c1 b1
movdqa
xmm1
,
xmm5
;c1 b1
movdqa
xmm1
,
xmm5
;c1 b1
paddw
xmm5
,
xmm4
;dl+cl a1+b1 aka op[4] op[0]
paddw
xmm5
,
xmm4
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm4
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
psubw
xmm4
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
paddw
xmm5
,
xmm0
;
paddw
xmm4
,
xmm0
; 33 32 31 30 23 22 21 20
psraw
xmm5
,
3
;
psraw
xmm4
,
3
movdqa
xmm0
,
xmm5
; 13 12 11 10 03 02 01 00
punpcklwd
xmm5
,
xmm4
; 23 03 22 02 21 01 20 00
movd
eax
,
xmm5
punpckhwd
xmm0
,
xmm4
; 33 13 32 12 31 11 30 10
movd
ecx
,
xmm4
movdqa
xmm1
,
xmm5
; 23 03 22 02 21 01 20 00
psrldq
xmm5
,
4
punpcklwd
xmm5
,
xmm0
; 31 21 11 01 30 20 10 00
psrldq
xmm4
,
4
punpckhwd
xmm1
,
xmm0
; 33 23 13 03 32 22 12 02
mov
word
ptr
[
rdx
+
32
*
0
],
ax
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mov
word
ptr
[
rdx
+
32
*
2
],
cx
paddw
xmm5
,
xmm6
shr
eax
,
16
paddw
xmm1
,
xmm6
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
4
],
ax
psraw
xmm5
,
3
mov
word
ptr
[
rdx
+
32
*
6
],
cx
psraw
xmm1
,
3
movd
eax
,
xmm5
movd
ecx
,
xmm4
;; movdqa [rdi + 0], xmm5
psrldq
xmm5
,
4
;; movdqa [rdi + 16], xmm1
psrldq
xmm4
,
4
mov
word
ptr
[
rdx
+
32
*
8
],
ax
movd
eax
,
xmm5
mov
word
ptr
[
rdx
+
32
*
10
],
cx
psrldq
xmm5
,
4
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
0
],
ax
shr
ecx
,
16
shr
eax
,
16
mov
word
ptr
[
rdx
+
32
*
12
],
ax
mov
word
ptr
[
rdi
+
32
*
1
],
ax
mov
word
ptr
[
rdx
+
32
*
14
],
cx
movd
eax
,
xmm5
psrldq
xmm5
,
4
movd
eax
,
xmm5
mov
word
ptr
[
rdi
+
32
*
2
],
ax
movd
ecx
,
xmm4
shr
eax
,
16
psrldq
xmm5
,
4
mov
word
ptr
[
rdi
+
32
*
3
],
ax
psrldq
xmm4
,
4
mov
word
ptr
[
rdx
+
32
*
1
],
ax
movd
eax
,
xmm5
mov
word
ptr
[
rdx
+
32
*
3
],
cx
psrldq
xmm5
,
4
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
4
],
ax
shr
ecx
,
16
shr
eax
,
16
mov
word
ptr
[
rdx
+
32
*
5
],
ax
mov
word
ptr
[
rdi
+
32
*
5
],
ax
mov
word
ptr
[
rdx
+
32
*
7
],
cx
movd
eax
,
xmm5
movd
eax
,
xmm5
mov
word
ptr
[
rdi
+
32
*
6
],
ax
movd
ecx
,
xmm4
shr
eax
,
16
mov
word
ptr
[
rdx
+
32
*
9
],
ax
mov
word
ptr
[
rdi
+
32
*
7
],
ax
mov
word
ptr
[
rdx
+
32
*
11
],
cx
shr
eax
,
16
movd
eax
,
xmm1
shr
ecx
,
16
psrldq
xmm1
,
4
mov
word
ptr
[
rdx
+
32
*
13
],
ax
mov
word
ptr
[
rdi
+
32
*
8
],
ax
mov
word
ptr
[
rdx
+
32
*
15
],
cx
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
9
],
ax
movd
eax
,
xmm1
psrldq
xmm1
,
4
mov
word
ptr
[
rdi
+
32
*
10
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
11
],
ax
movd
eax
,
xmm1
psrldq
xmm1
,
4
mov
word
ptr
[
rdi
+
32
*
12
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
13
],
ax
movd
eax
,
xmm1
mov
word
ptr
[
rdi
+
32
*
14
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
15
],
ax
; begin epilog
; begin epilog
pop
rdi
pop
rsi
REST
ORE_XMM
UNSHADOW_ARGS
UNSHADOW_ARGS
pop
rbp
pop
rbp
ret
ret
SECTION
_RODATA
align
16
x_s1sqr2:
times
4
dw
0x8A8C
align
16
x_c1sqr2less1:
times
4
dw
0x4E7B
align
16
fours:
times
4
dw
0x0004
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment