Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
10
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Xiph.Org
aom-rav1e
Commits
afa1b661
Commit
afa1b661
authored
Dec 12, 2011
by
Scott LaVarnway
Committed by
Gerrit Code Review
Dec 12, 2011
Browse files
Options
Browse Files
Download
Plain Diff
Merge "Improved mmx/sse2 versions of iwalsh"
parents
a69810b8
9fa6132f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
169 additions
and
246 deletions
+169
-246
vp8/common/x86/iwalsh_mmx.asm
vp8/common/x86/iwalsh_mmx.asm
+94
-131
vp8/common/x86/iwalsh_sse2.asm
vp8/common/x86/iwalsh_sse2.asm
+75
-115
No files found.
vp8/common/x86/iwalsh_mmx.asm
View file @
afa1b661
...
...
@@ -17,160 +17,123 @@ sym(vp8_short_inv_walsh4x4_mmx):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
2
push
rsi
push
rdi
; end prolog
mov
rax
,
3
mov
rsi
,
arg
(
0
)
mov
rdi
,
arg
(
1
)
shl
rax
,
16
mov
rdx
,
arg
(
0
)
mov
rax
,
30003h
movq
mm0
,
[
r
si
+
0
]
;ip[0]
movq
mm1
,
[
r
si
+
8
]
;ip[4]
or
rax
,
3
;00030003h
movq
mm0
,
[
r
dx
+
0
]
;ip[0]
movq
mm1
,
[
r
dx
+
8
]
;ip[4]
movd
mm7
,
rax
movq
mm2
,
[
rsi
+
16
]
;ip[8]
movq
mm3
,
[
rsi
+
24
]
;ip[12]
movq
mm2
,
[
rdx
+
16
]
;ip[8]
movq
mm3
,
[
rdx
+
24
]
;ip[12]
punpcklwd
mm7
,
mm7
;0003000300030003h
mov
rdx
,
arg
(
1
)
movq
mm
7
,
rax
movq
mm
4
,
mm
0
movq
mm
4
,
mm0
movq
mm
5
,
mm
1
p
unpcklwd
mm
7
,
mm
7
;
0003000300030003h
movq
mm5
,
mm
1
p
addw
mm
4
,
mm
3
;
ip[0] + ip[12] aka al
paddw
mm5
,
mm
2
;ip[4] + ip[8] aka bl
paddw
mm4
,
mm3
;ip[0] + ip[12] aka al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
movq
mm6
,
mm4
;temp al
paddw
mm4
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
movq
mm6
,
mm4
;temp al
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm1
,
mm2
;ip[4] - ip[8] aka c1
paddw
mm4
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm1
,
mm2
;ip[4] - ip[8] aka c1
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm1
;dl + cl
psubw
mm5
,
mm1
;dl - cl
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm1
;dl + cl
psubw
mm5
,
mm1
;dl - cl
; 03 02 01 00
; 13 12 11 10
; 23 22 21 20
; 33 32 31 30
movq
mm3
,
mm4
; 03 02 01 00
punpcklwd
mm4
,
mm0
; 11 01 10 00
punpckhwd
mm3
,
mm0
; 13 03 12 02
movq
mm3
,
mm4
; 03 02 01 00
punpcklwd
mm4
,
mm0
; 11 01 10 00
punpckhwd
mm3
,
mm0
; 13 03 12 02
movq
mm1
,
mm6
; 23 22 21 20
punpcklwd
mm6
,
mm5
; 31 21 30 20
punpckhwd
mm1
,
mm5
; 33 23 32 22
movq
mm1
,
mm6
; 23 22 21 20
punpcklwd
mm6
,
mm5
; 31 21 30 20
punpckhwd
mm1
,
mm5
; 33 23 32 22
movq
mm0
,
mm4
; 11 01 10 00
movq
mm2
,
mm3
; 13 03 12 02
movq
mm0
,
mm4
; 11 01 10 00
movq
mm2
,
mm3
; 13 03 12 02
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
punpckhdq
mm4
,
mm6
; 31 21 11 01 aka ip[4]
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
punpckhdq
mm4
,
mm6
; 31 21 11 01 aka ip[4]
punpckldq
mm2
,
mm1
; 32 22 12 02 aka ip[8]
punpckhdq
mm3
,
mm1
; 33 23 13 03 aka ip[12]
punpckldq
mm2
,
mm1
; 32 22 12 02 aka ip[8]
punpckhdq
mm3
,
mm1
; 33 23 13 03 aka ip[12]
;~~~~~~~~~~~~~~~~~~~~~
movq
mm1
,
mm0
movq
mm5
,
mm4
paddw
mm1
,
mm3
;ip[0] + ip[12] aka al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
movq
mm6
,
mm1
;temp al
paddw
mm1
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm4
,
mm2
;ip[4] - ip[8] aka c1
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm4
;dl + cl
psubw
mm5
,
mm4
;dl - cl
movq
mm1
,
mm0
movq
mm5
,
mm4
paddw
mm1
,
mm3
;ip[0] + ip[12] aka al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
movq
mm6
,
mm1
;temp al
paddw
mm1
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
paddw
mm1
,
mm7
paddw
mm6
,
mm7
psraw
mm1
,
3
psraw
mm6
,
3
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm4
,
mm2
;ip[4] - ip[8] aka c1
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm4
;dl + cl
psubw
mm5
,
mm4
;dl - cl
paddw
mm0
,
mm7
paddw
mm5
,
mm7
psraw
mm0
,
3
psraw
mm5
,
3
;~~~~~~~~~~~~~~~~~~~~~
movq
mm3
,
mm1
; 03 02 01 00
punpcklwd
mm1
,
mm0
; 11 01 10 00
punpckhwd
mm3
,
mm0
; 13 03 12 02
movq
mm4
,
mm6
; 23 22 21 20
punpcklwd
mm6
,
mm5
; 31 21 30 20
punpckhwd
mm4
,
mm5
; 33 23 32 22
movq
mm0
,
mm1
; 11 01 10 00
movq
mm2
,
mm3
; 13 03 12 02
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
punpckhdq
mm1
,
mm6
; 31 21 11 01 aka ip[4]
punpckldq
mm2
,
mm4
; 32 22 12 02 aka ip[8]
punpckhdq
mm3
,
mm4
; 33 23 13 03 aka ip[12]
paddw
mm0
,
mm7
paddw
mm1
,
mm7
paddw
mm2
,
mm7
paddw
mm3
,
mm7
psraw
mm0
,
3
psraw
mm1
,
3
psraw
mm2
,
3
psraw
mm3
,
3
; movq [rdi + 0], mm0
; movq [rdi + 8], mm1
; movq [rdi + 16], mm2
; movq [rdi + 24], mm3
movd
eax
,
mm0
psrlq
mm0
,
32
mov
word
ptr
[
rdi
+
32
*
0
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
1
],
ax
movd
eax
,
mm0
mov
word
ptr
[
rdi
+
32
*
2
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
3
],
ax
movd
ecx
,
mm1
psrlq
mm1
,
32
mov
word
ptr
[
rdi
+
32
*
4
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
5
],
cx
movd
ecx
,
mm1
mov
word
ptr
[
rdi
+
32
*
6
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
7
],
cx
movd
eax
,
mm2
psrlq
mm2
,
32
mov
word
ptr
[
rdi
+
32
*
8
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
9
],
ax
movd
eax
,
mm2
mov
word
ptr
[
rdi
+
32
*
10
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
11
],
ax
movd
ecx
,
mm3
psrlq
mm3
,
32
mov
word
ptr
[
rdi
+
32
*
12
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
13
],
cx
movd
ecx
,
mm3
mov
word
ptr
[
rdi
+
32
*
14
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
15
],
cx
movd
eax
,
mm1
movd
ecx
,
mm0
psrlq
mm0
,
32
psrlq
mm1
,
32
mov
word
ptr
[
rdx
+
32
*
0
],
ax
mov
word
ptr
[
rdx
+
32
*
1
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
4
],
ax
mov
word
ptr
[
rdx
+
32
*
5
],
cx
movd
eax
,
mm1
movd
ecx
,
mm0
mov
word
ptr
[
rdx
+
32
*
8
],
ax
mov
word
ptr
[
rdx
+
32
*
9
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
12
],
ax
mov
word
ptr
[
rdx
+
32
*
13
],
cx
movd
eax
,
mm6
movd
ecx
,
mm5
psrlq
mm5
,
32
psrlq
mm6
,
32
mov
word
ptr
[
rdx
+
32
*
2
],
ax
mov
word
ptr
[
rdx
+
32
*
3
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
6
],
ax
mov
word
ptr
[
rdx
+
32
*
7
],
cx
movd
eax
,
mm6
movd
ecx
,
mm5
mov
word
ptr
[
rdx
+
32
*
10
],
ax
mov
word
ptr
[
rdx
+
32
*
11
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
14
],
ax
mov
word
ptr
[
rdx
+
32
*
15
],
cx
; begin epilog
pop
rdi
pop
rsi
UNSHADOW_ARGS
pop
rbp
ret
...
...
vp8/common/x86/iwalsh_sse2.asm
View file @
afa1b661
...
...
@@ -17,145 +17,105 @@ sym(vp8_short_inv_walsh4x4_sse2):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
2
SAVE_XMM
6
push
rsi
push
rdi
; end prolog
mov
rsi
,
arg
(
0
)
mov
rd
i
,
arg
(
1
)
mov
rax
,
3
mov
rcx
,
arg
(
0
)
mov
rd
x
,
arg
(
1
)
mov
rax
,
3
0003h
movdqa
xmm0
,
[
r
si
+
0
]
;ip[4] ip[0]
movdqa
xmm1
,
[
r
si
+
16
]
;ip[12] ip[8]
movdqa
xmm0
,
[
r
cx
+
0
]
;ip[4] ip[0]
movdqa
xmm1
,
[
r
cx
+
16
]
;ip[12] ip[8]
shl
rax
,
16
or
rax
,
3
;00030003h
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm0
;ip[4] ip[0]
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm0
;ip[4] ip[0]
paddw
xmm0
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
paddw
xmm0
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa
xmm4
,
xmm0
movdqa
xmm4
,
xmm0
punpcklqdq
xmm0
,
xmm3
;d1 a1
punpckhqdq
xmm4
,
xmm3
;c1 b1
movd
xmm6
,
eax
movdqa
xmm1
,
xmm4
;c1 b1
paddw
xmm4
,
xmm0
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm0
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
movdqa
xmm1
,
xmm4
;c1 b1
paddw
xmm4
,
xmm0
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm0
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
;;;temp output
;; movdqu [rdi + 0], xmm4
;; movdqu [rdi + 16], xmm3
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
movdqa
xmm3
,
xmm4
; 13 12 11 10 03 02 01 00
punpcklwd
xmm4
,
xmm0
; 23 03 22 02 21 01 20 00
punpckhwd
xmm3
,
xmm0
; 33 13 32 12 31 11 30 10
movdqa
xmm1
,
xmm4
; 23 03 22 02 21 01 20 00
punpcklwd
xmm4
,
xmm3
; 31 21 11 01 30 20 10 00
punpckhwd
xmm1
,
xmm3
; 33 23 13 03 32 22 12 02
movdqa
xmm3
,
xmm4
; 13 12 11 10 03 02 01 00
punpcklwd
xmm4
,
xmm0
; 23 03 22 02 21 01 20 00
punpckhwd
xmm3
,
xmm0
; 33 13 32 12 31 11 30 10
movdqa
xmm1
,
xmm4
; 23 03 22 02 21 01 20 00
punpcklwd
xmm4
,
xmm3
; 31 21 11 01 30 20 10 00
punpckhwd
xmm1
,
xmm3
; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm4
;ip[4] ip[0]
movd
xmm0
,
eax
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm4
;ip[4] ip[0]
pshufd
xmm
6
,
xmm
6
,
0
;03 03 03 03 03 03 03 03
pshufd
xmm
0
,
xmm
0
,
0
;03 03 03 03 03 03 03 03
paddw
xmm4
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
paddw
xmm4
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa
xmm5
,
xmm4
movdqa
xmm5
,
xmm4
punpcklqdq
xmm4
,
xmm3
;d1 a1
punpckhqdq
xmm5
,
xmm3
;c1 b1
movdqa
xmm1
,
xmm5
;c1 b1
paddw
xmm5
,
xmm4
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm4
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
movdqa
xmm0
,
xmm5
; 13 12 11 10 03 02 01 00
punpcklwd
xmm5
,
xmm4
; 23 03 22 02 21 01 20 00
punpckhwd
xmm0
,
xmm4
; 33 13 32 12 31 11 30 10
movdqa
xmm1
,
xmm5
; 23 03 22 02 21 01 20 00
punpcklwd
xmm5
,
xmm0
; 31 21 11 01 30 20 10 00
punpckhwd
xmm1
,
xmm0
; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
paddw
xmm5
,
xmm6
paddw
xmm1
,
xmm6
psraw
xmm5
,
3
psraw
xmm1
,
3
;; movdqa [rdi + 0], xmm5
;; movdqa [rdi + 16], xmm1
movd
eax
,
xmm5
psrldq
xmm5
,
4
mov
word
ptr
[
rdi
+
32
*
0
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
1
],
ax
movd
eax
,
xmm5
psrldq
xmm5
,
4
mov
word
ptr
[
rdi
+
32
*
2
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
3
],
ax
movd
eax
,
xmm5
psrldq
xmm5
,
4
mov
word
ptr
[
rdi
+
32
*
4
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
5
],
ax
movd
eax
,
xmm5
mov
word
ptr
[
rdi
+
32
*
6
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
7
],
ax
movd
eax
,
xmm1
psrldq
xmm1
,
4
mov
word
ptr
[
rdi
+
32
*
8
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
9
],
ax
movd
eax
,
xmm1
psrldq
xmm1
,
4
mov
word
ptr
[
rdi
+
32
*
10
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
11
],
ax
movd
eax
,
xmm1
psrldq
xmm1
,
4
mov
word
ptr
[
rdi
+
32
*
12
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
13
],
ax
movd
eax
,
xmm1
mov
word
ptr
[
rdi
+
32
*
14
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
15
],
ax
movdqa
xmm1
,
xmm5
;c1 b1
paddw
xmm5
,
xmm4
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm4
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
paddw
xmm5
,
xmm0
paddw
xmm4
,
xmm0
psraw
xmm5
,
3
psraw
xmm4
,
3
movd
eax
,
xmm5
movd
ecx
,
xmm4
psrldq
xmm5
,
4
psrldq
xmm4
,
4
mov
word
ptr
[
rdx
+
32
*
0
],
ax
mov
word
ptr
[
rdx
+
32
*
2
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
4
],
ax
mov
word
ptr
[
rdx
+
32
*
6
],
cx
movd
eax
,
xmm5
movd
ecx
,
xmm4
psrldq
xmm5
,
4
psrldq
xmm4
,
4
mov
word
ptr
[
rdx
+
32
*
8
],
ax
mov
word
ptr
[
rdx
+
32
*
10
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
12
],
ax
mov
word
ptr
[
rdx
+
32
*
14
],
cx
movd
eax
,
xmm5
movd
ecx
,
xmm4
psrldq
xmm5
,
4
psrldq
xmm4
,
4
mov
word
ptr
[
rdx
+
32
*
1
],
ax
mov
word
ptr
[
rdx
+
32
*
3
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
5
],
ax
mov
word
ptr
[
rdx
+
32
*
7
],
cx
movd
eax
,
xmm5
movd
ecx
,
xmm4
mov
word
ptr
[
rdx
+
32
*
9
],
ax
mov
word
ptr
[
rdx
+
32
*
11
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
13
],
ax
mov
word
ptr
[
rdx
+
32
*
15
],
cx
; begin epilog
pop
rdi
pop
rsi
REST
ORE_XMM
UNSHADOW_ARGS
pop
rbp
ret
SECTION
_RODATA
align
16
x_s1sqr2:
times
4
dw
0x8A8C
align
16
x_c1sqr2less1:
times
4
dw
0x4E7B
align
16
fours:
times
4
dw
0x0004
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment