Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
afa1b661
Commit
afa1b661
authored
Dec 12, 2011
by
Scott LaVarnway
Committed by
Gerrit Code Review
Dec 12, 2011
Browse files
Merge "Improved mmx/sse2 versions of iwalsh"
parents
a69810b8
9fa6132f
Changes
2
Hide whitespace changes
Inline
Side-by-side
vp8/common/x86/iwalsh_mmx.asm
View file @
afa1b661
...
...
@@ -17,160 +17,123 @@ sym(vp8_short_inv_walsh4x4_mmx):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
2
push
rsi
push
rdi
; end prolog
mov
rax
,
3
mov
rsi
,
arg
(
0
)
mov
rdi
,
arg
(
1
)
shl
rax
,
16
mov
rdx
,
arg
(
0
)
mov
rax
,
30003h
movq
mm0
,
[
rsi
+
0
]
;ip[0]
movq
mm1
,
[
rsi
+
8
]
;ip[4]
or
rax
,
3
;00030003h
movq
mm0
,
[
rdx
+
0
]
;ip[0]
movq
mm1
,
[
rdx
+
8
]
;ip[4]
movd
mm7
,
rax
movq
mm2
,
[
rsi
+
16
]
;ip[8]
movq
mm3
,
[
rsi
+
24
]
;ip[12]
movq
mm2
,
[
rdx
+
16
]
;ip[8]
movq
mm3
,
[
rdx
+
24
]
;ip[12]
punpcklwd
mm7
,
mm7
;0003000300030003h
mov
rdx
,
arg
(
1
)
movq
mm7
,
rax
movq
mm4
,
mm0
movq
mm4
,
mm0
movq
mm5
,
mm1
punpcklwd
mm7
,
mm7
;0003000300030003h
movq
mm5
,
mm1
paddw
mm4
,
mm3
;ip[0] + ip[12] aka al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
paddw
mm4
,
mm3
;ip[0] + ip[12] aka al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
movq
mm6
,
mm4
;temp al
paddw
mm4
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
movq
mm6
,
mm4
;temp al
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm1
,
mm2
;ip[4] - ip[8] aka c1
paddw
mm4
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm1
,
mm2
;ip[4] - ip[8] aka c1
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm1
;dl + cl
psubw
mm5
,
mm1
;dl - cl
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm1
;dl + cl
psubw
mm5
,
mm1
;dl - cl
; 03 02 01 00
; 13 12 11 10
; 23 22 21 20
; 33 32 31 30
movq
mm3
,
mm4
; 03 02 01 00
punpcklwd
mm4
,
mm0
; 11 01 10 00
punpckhwd
mm3
,
mm0
; 13 03 12 02
movq
mm3
,
mm4
; 03 02 01 00
punpcklwd
mm4
,
mm0
; 11 01 10 00
punpckhwd
mm3
,
mm0
; 13 03 12 02
movq
mm1
,
mm6
; 23 22 21 20
punpcklwd
mm6
,
mm5
; 31 21 30 20
punpckhwd
mm1
,
mm5
; 33 23 32 22
movq
mm1
,
mm6
; 23 22 21 20
punpcklwd
mm6
,
mm5
; 31 21 30 20
punpckhwd
mm1
,
mm5
; 33 23 32 22
movq
mm0
,
mm4
; 11 01 10 00
movq
mm2
,
mm3
; 13 03 12 02
movq
mm0
,
mm4
; 11 01 10 00
movq
mm2
,
mm3
; 13 03 12 02
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
punpckhdq
mm4
,
mm6
; 31 21 11 01 aka ip[4]
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
punpckhdq
mm4
,
mm6
; 31 21 11 01 aka ip[4]
punpckldq
mm2
,
mm1
; 32 22 12 02 aka ip[8]
punpckhdq
mm3
,
mm1
; 33 23 13 03 aka ip[12]
punpckldq
mm2
,
mm1
; 32 22 12 02 aka ip[8]
punpckhdq
mm3
,
mm1
; 33 23 13 03 aka ip[12]
;~~~~~~~~~~~~~~~~~~~~~
movq
mm1
,
mm0
movq
mm5
,
mm4
paddw
mm1
,
mm3
;ip[0] + ip[12] aka al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
movq
mm6
,
mm1
;temp al
paddw
mm1
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm4
,
mm2
;ip[4] - ip[8] aka c1
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm4
;dl + cl
psubw
mm5
,
mm4
;dl - cl
movq
mm1
,
mm0
movq
mm5
,
mm4
paddw
mm1
,
mm3
;ip[0] + ip[12] aka al
paddw
mm5
,
mm2
;ip[4] + ip[8] aka bl
movq
mm6
,
mm1
;temp al
paddw
mm1
,
mm5
;al + bl
psubw
mm6
,
mm5
;al - bl
paddw
mm1
,
mm7
paddw
mm6
,
mm7
psraw
mm1
,
3
psraw
mm6
,
3
psubw
mm0
,
mm3
;ip[0] - ip[12] aka d1
psubw
mm4
,
mm2
;ip[4] - ip[8] aka c1
movq
mm5
,
mm0
;temp dl
paddw
mm0
,
mm4
;dl + cl
psubw
mm5
,
mm4
;dl - cl
paddw
mm0
,
mm7
paddw
mm5
,
mm7
psraw
mm0
,
3
psraw
mm5
,
3
;~~~~~~~~~~~~~~~~~~~~~
movq
mm3
,
mm1
; 03 02 01 00
punpcklwd
mm1
,
mm0
; 11 01 10 00
punpckhwd
mm3
,
mm0
; 13 03 12 02
movq
mm4
,
mm6
; 23 22 21 20
punpcklwd
mm6
,
mm5
; 31 21 30 20
punpckhwd
mm4
,
mm5
; 33 23 32 22
movq
mm0
,
mm1
; 11 01 10 00
movq
mm2
,
mm3
; 13 03 12 02
punpckldq
mm0
,
mm6
; 30 20 10 00 aka ip[0]
punpckhdq
mm1
,
mm6
; 31 21 11 01 aka ip[4]
punpckldq
mm2
,
mm4
; 32 22 12 02 aka ip[8]
punpckhdq
mm3
,
mm4
; 33 23 13 03 aka ip[12]
paddw
mm0
,
mm7
paddw
mm1
,
mm7
paddw
mm2
,
mm7
paddw
mm3
,
mm7
psraw
mm0
,
3
psraw
mm1
,
3
psraw
mm2
,
3
psraw
mm3
,
3
; movq [rdi + 0], mm0
; movq [rdi + 8], mm1
; movq [rdi + 16], mm2
; movq [rdi + 24], mm3
movd
eax
,
mm0
psrlq
mm0
,
32
mov
word
ptr
[
rdi
+
32
*
0
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
1
],
ax
movd
eax
,
mm0
mov
word
ptr
[
rdi
+
32
*
2
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
3
],
ax
movd
ecx
,
mm1
psrlq
mm1
,
32
mov
word
ptr
[
rdi
+
32
*
4
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
5
],
cx
movd
ecx
,
mm1
mov
word
ptr
[
rdi
+
32
*
6
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
7
],
cx
movd
eax
,
mm2
psrlq
mm2
,
32
mov
word
ptr
[
rdi
+
32
*
8
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
9
],
ax
movd
eax
,
mm2
mov
word
ptr
[
rdi
+
32
*
10
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
11
],
ax
movd
ecx
,
mm3
psrlq
mm3
,
32
mov
word
ptr
[
rdi
+
32
*
12
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
13
],
cx
movd
ecx
,
mm3
mov
word
ptr
[
rdi
+
32
*
14
],
cx
shr
ecx
,
16
mov
word
ptr
[
rdi
+
32
*
15
],
cx
movd
eax
,
mm1
movd
ecx
,
mm0
psrlq
mm0
,
32
psrlq
mm1
,
32
mov
word
ptr
[
rdx
+
32
*
0
],
ax
mov
word
ptr
[
rdx
+
32
*
1
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
4
],
ax
mov
word
ptr
[
rdx
+
32
*
5
],
cx
movd
eax
,
mm1
movd
ecx
,
mm0
mov
word
ptr
[
rdx
+
32
*
8
],
ax
mov
word
ptr
[
rdx
+
32
*
9
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
12
],
ax
mov
word
ptr
[
rdx
+
32
*
13
],
cx
movd
eax
,
mm6
movd
ecx
,
mm5
psrlq
mm5
,
32
psrlq
mm6
,
32
mov
word
ptr
[
rdx
+
32
*
2
],
ax
mov
word
ptr
[
rdx
+
32
*
3
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
6
],
ax
mov
word
ptr
[
rdx
+
32
*
7
],
cx
movd
eax
,
mm6
movd
ecx
,
mm5
mov
word
ptr
[
rdx
+
32
*
10
],
ax
mov
word
ptr
[
rdx
+
32
*
11
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
14
],
ax
mov
word
ptr
[
rdx
+
32
*
15
],
cx
; begin epilog
pop
rdi
pop
rsi
UNSHADOW_ARGS
pop
rbp
ret
...
...
vp8/common/x86/iwalsh_sse2.asm
View file @
afa1b661
...
...
@@ -17,145 +17,105 @@ sym(vp8_short_inv_walsh4x4_sse2):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
2
SAVE_XMM
6
push
rsi
push
rdi
; end prolog
mov
rsi
,
arg
(
0
)
mov
rdi
,
arg
(
1
)
mov
rax
,
3
mov
rcx
,
arg
(
0
)
mov
rdx
,
arg
(
1
)
mov
rax
,
30003h
movdqa
xmm0
,
[
rsi
+
0
]
;ip[4] ip[0]
movdqa
xmm1
,
[
rsi
+
16
]
;ip[12] ip[8]
movdqa
xmm0
,
[
rcx
+
0
]
;ip[4] ip[0]
movdqa
xmm1
,
[
rcx
+
16
]
;ip[12] ip[8]
shl
rax
,
16
or
rax
,
3
;00030003h
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm0
;ip[4] ip[0]
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm0
;ip[4] ip[0]
paddw
xmm0
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
paddw
xmm0
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa
xmm4
,
xmm0
movdqa
xmm4
,
xmm0
punpcklqdq
xmm0
,
xmm3
;d1 a1
punpckhqdq
xmm4
,
xmm3
;c1 b1
movd
xmm6
,
eax
movdqa
xmm1
,
xmm4
;c1 b1
paddw
xmm4
,
xmm0
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm0
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
movdqa
xmm1
,
xmm4
;c1 b1
paddw
xmm4
,
xmm0
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm0
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
;;;temp output
;; movdqu [rdi + 0], xmm4
;; movdqu [rdi + 16], xmm3
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
movdqa
xmm3
,
xmm4
; 13 12 11 10 03 02 01 00
punpcklwd
xmm4
,
xmm0
; 23 03 22 02 21 01 20 00
punpckhwd
xmm3
,
xmm0
; 33 13 32 12 31 11 30 10
movdqa
xmm1
,
xmm4
; 23 03 22 02 21 01 20 00
punpcklwd
xmm4
,
xmm3
; 31 21 11 01 30 20 10 00
punpckhwd
xmm1
,
xmm3
; 33 23 13 03 32 22 12 02
movdqa
xmm3
,
xmm4
; 13 12 11 10 03 02 01 00
punpcklwd
xmm4
,
xmm0
; 23 03 22 02 21 01 20 00
punpckhwd
xmm3
,
xmm0
; 33 13 32 12 31 11 30 10
movdqa
xmm1
,
xmm4
; 23 03 22 02 21 01 20 00
punpcklwd
xmm4
,
xmm3
; 31 21 11 01 30 20 10 00
punpckhwd
xmm1
,
xmm3
; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm4
;ip[4] ip[0]
movd
xmm0
,
eax
pshufd
xmm2
,
xmm1
,
4eh
;ip[8] ip[12]
movdqa
xmm3
,
xmm4
;ip[4] ip[0]
pshufd
xmm6
,
xmm6
,
0
;03 03 03 03 03 03 03 03
pshufd
xmm0
,
xmm0
,
0
;03 03 03 03 03 03 03 03
paddw
xmm4
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
paddw
xmm4
,
xmm2
;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
psubw
xmm3
,
xmm2
;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
movdqa
xmm5
,
xmm4
movdqa
xmm5
,
xmm4
punpcklqdq
xmm4
,
xmm3
;d1 a1
punpckhqdq
xmm5
,
xmm3
;c1 b1
movdqa
xmm1
,
xmm5
;c1 b1
paddw
xmm5
,
xmm4
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm4
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; 13 12 11 10 03 02 01 00
;
; 33 32 31 30 23 22 21 20
;
movdqa
xmm0
,
xmm5
; 13 12 11 10 03 02 01 00
punpcklwd
xmm5
,
xmm4
; 23 03 22 02 21 01 20 00
punpckhwd
xmm0
,
xmm4
; 33 13 32 12 31 11 30 10
movdqa
xmm1
,
xmm5
; 23 03 22 02 21 01 20 00
punpcklwd
xmm5
,
xmm0
; 31 21 11 01 30 20 10 00
punpckhwd
xmm1
,
xmm0
; 33 23 13 03 32 22 12 02
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
paddw
xmm5
,
xmm6
paddw
xmm1
,
xmm6
psraw
xmm5
,
3
psraw
xmm1
,
3
;; movdqa [rdi + 0], xmm5
;; movdqa [rdi + 16], xmm1
movd
eax
,
xmm5
psrldq
xmm5
,
4
mov
word
ptr
[
rdi
+
32
*
0
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
1
],
ax
movd
eax
,
xmm5
psrldq
xmm5
,
4
mov
word
ptr
[
rdi
+
32
*
2
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
3
],
ax
movd
eax
,
xmm5
psrldq
xmm5
,
4
mov
word
ptr
[
rdi
+
32
*
4
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
5
],
ax
movd
eax
,
xmm5
mov
word
ptr
[
rdi
+
32
*
6
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
7
],
ax
movd
eax
,
xmm1
psrldq
xmm1
,
4
mov
word
ptr
[
rdi
+
32
*
8
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
9
],
ax
movd
eax
,
xmm1
psrldq
xmm1
,
4
mov
word
ptr
[
rdi
+
32
*
10
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
11
],
ax
movd
eax
,
xmm1
psrldq
xmm1
,
4
mov
word
ptr
[
rdi
+
32
*
12
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
13
],
ax
movd
eax
,
xmm1
mov
word
ptr
[
rdi
+
32
*
14
],
ax
shr
eax
,
16
mov
word
ptr
[
rdi
+
32
*
15
],
ax
movdqa
xmm1
,
xmm5
;c1 b1
paddw
xmm5
,
xmm4
;dl+cl a1+b1 aka op[4] op[0]
psubw
xmm4
,
xmm1
;d1-c1 a1-b1 aka op[12] op[8]
paddw
xmm5
,
xmm0
paddw
xmm4
,
xmm0
psraw
xmm5
,
3
psraw
xmm4
,
3
movd
eax
,
xmm5
movd
ecx
,
xmm4
psrldq
xmm5
,
4
psrldq
xmm4
,
4
mov
word
ptr
[
rdx
+
32
*
0
],
ax
mov
word
ptr
[
rdx
+
32
*
2
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
4
],
ax
mov
word
ptr
[
rdx
+
32
*
6
],
cx
movd
eax
,
xmm5
movd
ecx
,
xmm4
psrldq
xmm5
,
4
psrldq
xmm4
,
4
mov
word
ptr
[
rdx
+
32
*
8
],
ax
mov
word
ptr
[
rdx
+
32
*
10
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
12
],
ax
mov
word
ptr
[
rdx
+
32
*
14
],
cx
movd
eax
,
xmm5
movd
ecx
,
xmm4
psrldq
xmm5
,
4
psrldq
xmm4
,
4
mov
word
ptr
[
rdx
+
32
*
1
],
ax
mov
word
ptr
[
rdx
+
32
*
3
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
5
],
ax
mov
word
ptr
[
rdx
+
32
*
7
],
cx
movd
eax
,
xmm5
movd
ecx
,
xmm4
mov
word
ptr
[
rdx
+
32
*
9
],
ax
mov
word
ptr
[
rdx
+
32
*
11
],
cx
shr
eax
,
16
shr
ecx
,
16
mov
word
ptr
[
rdx
+
32
*
13
],
ax
mov
word
ptr
[
rdx
+
32
*
15
],
cx
; begin epilog
pop
rdi
pop
rsi
RESTORE_XMM
UNSHADOW_ARGS
pop
rbp
ret
SECTION
_RODATA
align
16
x_s1sqr2:
times
4
dw
0x8A8C
align
16
x_c1sqr2less1:
times
4
dw
0x4E7B
align
16
fours:
times
4
dw
0x0004
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment