Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Xiph.Org
aom-rav1e
Commits
f5615b61
Commit
f5615b61
authored
Aug 11, 2010
by
Scott LaVarnway
Committed by
Code Review
Aug 11, 2010
Browse files
Options
Browse Files
Download
Plain Diff
Merge "Finished vp8_sixtap_predict4x4_ssse3 function"
parents
d22e2968
b07e5b6f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
184 additions
and
190 deletions
+184
-190
vp8/common/x86/subpixel_ssse3.asm
vp8/common/x86/subpixel_ssse3.asm
+143
-172
vp8/common/x86/subpixel_x86.h
vp8/common/x86/subpixel_x86.h
+2
-2
vp8/common/x86/vp8_asm_stubs.c
vp8/common/x86/vp8_asm_stubs.c
+38
-15
vp8/common/x86/x86_systemdependent.c
vp8/common/x86/x86_systemdependent.c
+1
-1
No files found.
vp8/common/x86/subpixel_ssse3.asm
View file @
f5615b61
...
...
@@ -316,21 +316,21 @@ sym(vp8_filter_block1d4_h6_ssse3):
; end prolog
movsxd
rdx
,
DWORD
PTR
arg
(
5
)
;table index
mov
rsi
,
arg
(
0
)
;src_ptr
xor
rsi
,
rsi
shl
rdx
,
4
;
lea
rax
,
[
k0_k5
GLOBAL
]
add
rax
,
rdx
movdqa
xmm7
,
[
rd
GLOBAL
]
cmp
esi
,
DWORD
PTR
[
rax
]
je
vp8_filter_block1d4_h4_ssse3
movdqa
xmm4
,
XMMWORD
PTR
[
rax
]
;k0_k5
movdqa
xmm5
,
XMMWORD
PTR
[
rax
+
256
]
;k2_k4
movdqa
xmm6
,
XMMWORD
PTR
[
rax
+
128
]
;k1_k3
mov
rsi
,
arg
(
0
)
;src_ptr
mov
rdi
,
arg
(
2
)
;output_ptr
movsxd
rax
,
dword
ptr
arg
(
1
)
;src_pixels_per_line
movsxd
rcx
,
dword
ptr
arg
(
4
)
;output_height
...
...
@@ -362,10 +362,8 @@ filter_block1d4_h6_rowloop_ssse3:
psraw
xmm0
,
7
packuswb
xmm0
,
xmm0
;
punpcklbw
xmm0
,
xmm1
movd
DWORD
PTR
[
rdi
],
xmm0
movq
MMWORD
PTR
[
rdi
],
xmm0
add
rdi
,
rdx
dec
rcx
jnz
filter_block1d4_h6_rowloop_ssse3
...
...
@@ -378,6 +376,53 @@ filter_block1d4_h6_rowloop_ssse3:
pop
rbp
ret
vp8_filter_block1d4_h4_ssse3:
movdqa
xmm5
,
XMMWORD
PTR
[
rax
+
256
]
;k2_k4
movdqa
xmm6
,
XMMWORD
PTR
[
rax
+
128
]
;k1_k3
movdqa
xmm0
,
XMMWORD
PTR
[
shuf2b
GLOBAL
]
movdqa
xmm3
,
XMMWORD
PTR
[
shuf3b
GLOBAL
]
mov
rsi
,
arg
(
0
)
;src_ptr
mov
rdi
,
arg
(
2
)
;output_ptr
movsxd
rax
,
dword
ptr
arg
(
1
)
;src_pixels_per_line
movsxd
rcx
,
dword
ptr
arg
(
4
)
;output_height
movsxd
rdx
,
dword
ptr
arg
(
3
)
;output_pitch
filter_block1d4_h4_rowloop_ssse3:
movdqu
xmm1
,
XMMWORD
PTR
[
rsi
-
2
]
movdqa
xmm2
,
xmm1
pshufb
xmm1
,
xmm0
;;[shuf2b GLOBAL]
pshufb
xmm2
,
xmm3
;;[shuf3b GLOBAL]
pmaddubsw
xmm1
,
xmm5
;--
pmaddubsw
xmm2
,
xmm6
lea
rsi
,
[
rsi
+
rax
]
;--
paddsw
xmm1
,
xmm7
paddsw
xmm1
,
xmm2
psraw
xmm1
,
7
packuswb
xmm1
,
xmm1
movd
DWORD
PTR
[
rdi
],
xmm1
add
rdi
,
rdx
dec
rcx
jnz
filter_block1d4_h4_rowloop_ssse3
; begin epilog
pop
rdi
pop
rsi
RESTORE_GOT
UNSHADOW_ARGS
pop
rbp
ret
;void vp8_filter_block1d16_v6_ssse3
;(
; unsigned char *src_ptr,
...
...
@@ -700,81 +745,88 @@ vp8_filter_block1d8_v4_ssse3_loop:
UNSHADOW_ARGS
pop
rbp
ret
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
global
sym
(
vp8_filter_block1d8_h6_ssse3_slow
)
sym
(
vp8_filter_block1d8_h6_ssse3_slow
):
;void vp8_filter_block1d4_v6_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; unsigned int vp8_filter_index
;)
global
sym
(
vp8_filter_block1d4_v6_ssse3
)
sym
(
vp8_filter_block1d4_v6_ssse3
):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
7
SHADOW_ARGS_TO_STACK
6
GET_GOT
rbx
push
rsi
push
rdi
; end prolog
mov
rdx
,
arg
(
6
)
;vp8_filter
mov
rsi
,
arg
(
0
)
;src_ptr
mov
rdi
,
arg
(
1
)
;output_ptr
movsxd
rcx
,
dword
ptr
arg
(
4
)
;output_height
movsxd
rax
,
dword
ptr
arg
(
2
)
;src_pixels_per_line
movq
xmm7
,
[
rdx
]
pxor
xmm4
,
xmm4
movdqa
xmm5
,
XMMWORD
PTR
[
shuf1
GLOBAL
]
movdqa
xmm6
,
XMMWORD
PTR
[
shuf2
GLOBAL
]
movsxd
rdx
,
dword
ptr
arg
(
5
)
;output_width
movsxd
rdx
,
DWORD
PTR
arg
(
5
)
;table index
xor
rsi
,
rsi
shl
rdx
,
4
;
punpcklqdq
xmm7
,
xmm7
;copy filter constants to upper 8 bytes
lea
rax
,
[
k0_k5
GLOBAL
]
add
rax
,
rdx
filter_block1d8_h6_rowloop3_slow:
movdqu
xmm0
,
XMMWORD
PTR
[
rsi
-
2
]
movsxd
rdx
,
DWORD
PTR
arg
(
1
)
;pixels_per_line
mov
rdi
,
arg
(
2
)
;output_ptr
%if ABI_IS_32BIT=0
movsxd
r8
,
DWORD
PTR
arg
(
3
)
; out_pitch
%endif
movsxd
rcx
,
DWORD
PTR
arg
(
4
)
;[output_height]
lea
rsi
,
[
rsi
+
rax
]
cmp
esi
,
DWORD
PTR
[
rax
]
je
vp8_filter_block1d4_v4_ssse3
movdqa
xmm1
,
xmm0
pshufb
xmm0
,
XMMWORD
PTR
[
shuf1
GLOBAL
]
movq
mm5
,
MMWORD
PTR
[
rax
]
;k0_k5
movq
mm6
,
MMWORD
PTR
[
rax
+
256
]
;k2_k4
movq
mm7
,
MMWORD
PTR
[
rax
+
128
]
;k1_k3
movdqa
xmm2
,
xmm1
pmaddubsw
xmm0
,
xmm7
pshufb
xmm1
,
XMMWORD
PTR
[
shuf2
GLOBAL
]
mov
rsi
,
arg
(
0
)
;src_ptr
movdqa
xmm3
,
xmm2
pmaddubsw
xmm1
,
xmm7
pshufb
xmm2
,
XMMWORD
PTR
[
shuf3
GLOBAL
]
mov
rax
,
rsi
add
rax
,
rdx
pshufb
xmm3
,
XMMWORD
PTR
[
shuf4
GLOBAL
]
vp8_filter_block1d4_v6_ssse3_loop:
movd
mm1
,
DWORD
PTR
[
rsi
]
;A
movd
mm2
,
DWORD
PTR
[
rsi
+
rdx
]
;B
movd
mm3
,
DWORD
PTR
[
rsi
+
rdx
*
2
]
;C
movd
mm4
,
DWORD
PTR
[
rax
+
rdx
*
2
]
;D
movd
mm0
,
DWORD
PTR
[
rsi
+
rdx
*
4
]
;E
pmaddubsw
xmm2
,
xmm7
pmaddubsw
xmm3
,
xmm7
;4 cycles
punpcklbw
mm2
,
mm4
;B D
punpcklbw
mm3
,
mm0
;C E
phaddsw
xmm0
,
xmm1
phaddsw
xmm2
,
xmm3
;7 cycles
phaddsw
xmm0
,
xmm2
;7 cycles
movd
mm0
,
DWORD
PTR
[
rax
+
rdx
*
4
]
;F
movq
mm4
,
[
rd
GLOBAL
]
paddsw
xmm0
,
[
rd
GLOBAL
]
psraw
xmm0
,
7
packuswb
xmm0
,
xmm0
pmaddubsw
mm3
,
mm6
punpcklbw
mm1
,
mm0
;A F
pmaddubsw
mm2
,
mm7
pmaddubsw
mm1
,
mm5
add
rsi
,
rdx
add
rax
,
rdx
;--
;--
paddsw
mm2
,
mm3
paddsw
mm2
,
mm1
paddsw
mm2
,
mm4
psraw
mm2
,
7
packuswb
mm2
,
mm2
;
punpcklbw
xmm0
,
xmm4
movd
DWORD
PTR
[
rdi
],
mm2
movdqa
XMMWORD
Ptr
[
rdi
],
xmm0
add
rdi
,
rdx
%if ABI_IS_32BIT
add
rdi
,
DWORD
PTR
arg
(
3
)
;[out_pitch]
%else
add
rdi
,
r8
%endif
dec
rcx
jnz
filter_block1d8_h6_rowloop3_slow
; next row
jnz
vp8_filter_block1d4_v6_ssse3_loop
; begin epilog
pop
rdi
...
...
@@ -783,111 +835,46 @@ filter_block1d8_h6_rowloop3_slow:
UNSHADOW_ARGS
pop
rbp
ret
;void vp8_filter_block1d16_h6_ssse3
;(
; unsigned char *src_ptr,
; unsigned short *output_ptr,
; unsigned int src_pixels_per_line,
; unsigned int pixel_step,
; unsigned int output_height,
; unsigned int output_width,
; short *vp8_filter
;)
global
sym
(
vp8_filter_block1d16_h6_ssse3_slow
)
sym
(
vp8_filter_block1d16_h6_ssse3_slow
):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
7
SAVE_XMM
GET_GOT
rbx
push
rsi
push
rdi
; end prolog
mov
rdx
,
arg
(
6
)
;vp8_filter
mov
rsi
,
arg
(
0
)
;src_ptr
mov
rdi
,
arg
(
1
)
;output_ptr
movsxd
rcx
,
dword
ptr
arg
(
4
)
;output_height
movsxd
rax
,
dword
ptr
arg
(
2
)
;src_pixels_per_line
movq
xmm7
,
[
rdx
]
pxor
xmm4
,
xmm4
movdqa
xmm5
,
XMMWORD
PTR
[
shuf1
GLOBAL
]
movdqa
xmm6
,
XMMWORD
PTR
[
shuf2
GLOBAL
]
movsxd
rdx
,
dword
ptr
arg
(
5
)
;output_width
vp8_filter_block1d4_v4_ssse3:
movq
mm6
,
MMWORD
PTR
[
rax
+
256
]
;k2_k4
movq
mm7
,
MMWORD
PTR
[
rax
+
128
]
;k1_k3
movq
mm5
,
MMWORD
PTR
[
rd
GLOBAL
]
punpcklqdq
xmm7
,
xmm7
;copy filter constants to upper 8 bytes
sub
rdi
,
rdx
filter_block1d16_h6_rowloop3_slow:
movdqu
xmm0
,
XMMWORD
PTR
[
rsi
-
2
]
movdqa
xmm1
,
xmm0
pshufb
xmm0
,
xmm5
mov
rsi
,
arg
(
0
)
;src_ptr
movdqa
xmm2
,
xmm1
pmaddubsw
xmm0
,
xmm7
pshufb
xmm1
,
xmm6
movdqa
xmm3
,
xmm2
pmaddubsw
xmm1
,
xmm7
pshufb
xmm2
,
XMMWORD
PTR
[
shuf3
GLOBAL
]
movdqu
xmm4
,
XMMWORD
PTR
[
rsi
+
6
]
pshufb
xmm3
,
XMMWORD
PTR
[
shuf4
GLOBAL
]
lea
rsi
,
[
rsi
+
rax
]
pmaddubsw
xmm2
,
xmm7
phaddsw
xmm0
,
xmm1
pmaddubsw
xmm3
,
xmm7
movdqa
xmm1
,
xmm4
pshufb
xmm4
,
xmm5
movdqa
xmm5
,
xmm1
pmaddubsw
xmm4
,
xmm7
pshufb
xmm1
,
xmm6
phaddsw
xmm2
,
xmm3
pmaddubsw
xmm1
,
xmm7
movdqa
xmm3
,
xmm5
pshufb
xmm5
,
XMMWORD
PTR
[
shuf3
GLOBAL
]
add
rdi
,
rdx
pmaddubsw
xmm5
,
xmm7
pshufb
xmm3
,
XMMWORD
PTR
[
shuf4
GLOBAL
]
phaddsw
xmm4
,
xmm1
dec
rcx
phaddsw
xmm0
,
xmm2
pmaddubsw
xmm3
,
xmm7
mov
rax
,
rsi
add
rax
,
rdx
vp8_filter_block1d4_v4_ssse3_loop:
movd
mm2
,
DWORD
PTR
[
rsi
+
rdx
]
;B
movd
mm3
,
DWORD
PTR
[
rsi
+
rdx
*
2
]
;C
movd
mm4
,
DWORD
PTR
[
rax
+
rdx
*
2
]
;D
movd
mm0
,
DWORD
PTR
[
rsi
+
rdx
*
4
]
;E
paddsw
xmm0
,
[
rd
GLOBAL
]
psraw
xmm0
,
7
packuswb
xmm0
,
xmm0
phaddsw
xmm5
,
xmm3
pxor
xmm3
,
xmm3
punpcklbw
xmm0
,
xmm3
;--
;--
;--
;--
punpcklbw
mm2
,
mm4
;B D
punpcklbw
mm3
,
mm0
;C E
phaddsw
xmm4
,
xmm5
movdqa
xmm5
,
XMMWORD
PTR
[
shuf1
GLOBAL
]
movdqa
XMMWORD
Ptr
[
rdi
],
xmm0
;--
;--
;--
pmaddubsw
mm3
,
mm6
pmaddubsw
mm2
,
mm7
add
rsi
,
rdx
add
rax
,
rdx
;--
;--
paddsw
xmm4
,
[
rd
GLOBAL
]
psraw
xmm4
,
7
packuswb
xmm4
,
xmm4
;
punpcklbw
xmm4
,
xmm3
paddsw
mm2
,
mm3
paddsw
mm2
,
mm5
psraw
mm2
,
7
packuswb
mm2
,
mm2
movdqa
XMMWORD
Ptr
[
rdi
+
16
],
xmm4
jnz
filter_block1d16_h6_rowloop3_slow
; next row
movd
DWORD
PTR
[
rdi
],
mm2
%if ABI_IS_32BIT
add
rdi
,
DWORD
PTR
arg
(
3
)
;[out_pitch]
%else
add
rdi
,
r8
%endif
dec
rcx
jnz
vp8_filter_block1d4_v4_ssse3_loop
; begin epilog
pop
rdi
...
...
@@ -899,22 +886,6 @@ filter_block1d16_h6_rowloop3_slow:
SECTION
_RODATA
align
16
shuf1:
db
0
,
1
,
2
,
4
,
3
,
5
,
128
,
128
,
1
,
2
,
3
,
5
,
4
,
6
,
128
,
128
shuf2:
db
2
,
3
,
4
,
6
,
5
,
7
,
128
,
128
,
3
,
4
,
5
,
7
,
6
,
8
,
128
,
128
shuf3:
db
4
,
5
,
6
,
8
,
7
,
9
,
128
,
128
,
5
,
6
,
7
,
9
,
8
,
10
,
128
,
128
shuf4:
db
6
,
7
,
8
,
10
,
9
,
11
,
128
,
128
,
7
,
8
,
9
,
11
,
10
,
12
,
128
,
128
shuf1a:
db
0
,
1
,
1
,
2
,
2
,
3
,
3
,
4
,
4
,
5
,
5
,
6
,
6
,
7
,
7
,
8
shuf2a:
db
2
,
4
,
3
,
5
,
4
,
6
,
5
,
7
,
6
,
8
,
7
,
9
,
8
,
10
,
9
,
11
shuf3a:
db
3
,
5
,
4
,
6
,
5
,
7
,
6
,
8
,
7
,
9
,
8
,
10
,
9
,
11
,
10
,
12
shuf1b:
db
0
,
5
,
1
,
6
,
2
,
7
,
3
,
8
,
4
,
9
,
5
,
10
,
6
,
11
,
7
,
12
shuf2b:
...
...
vp8/common/x86/subpixel_x86.h
View file @
f5615b61
...
...
@@ -104,8 +104,8 @@ extern prototype_subpixel_predict(vp8_sixtap_predict4x4_ssse3);
#undef vp8_subpix_sixtap8x4
#define vp8_subpix_sixtap8x4 vp8_sixtap_predict8x4_ssse3
//#undef vp8_subpix_sixtap4x4
//#define vp8_subpix_sixtap4x4 vp8_sixtap_predict4x4_ssse3
#undef vp8_subpix_sixtap4x4
#define vp8_subpix_sixtap4x4 vp8_sixtap_predict4x4_ssse3
//#undef vp8_subpix_bilinear16x16
...
...
vp8/common/x86/vp8_asm_stubs.c
View file @
f5615b61
...
...
@@ -402,6 +402,26 @@ extern void vp8_filter_block1d8_v6_ssse3
unsigned
int
vp8_filter_index
);
extern
void
vp8_filter_block1d4_h6_ssse3
(
unsigned
char
*
src_ptr
,
unsigned
int
src_pixels_per_line
,
unsigned
char
*
output_ptr
,
unsigned
int
output_pitch
,
unsigned
int
output_height
,
unsigned
int
vp8_filter_index
);
extern
void
vp8_filter_block1d4_v6_ssse3
(
unsigned
char
*
src_ptr
,
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
unsigned
int
vp8_filter_index
);
void
vp8_sixtap_predict16x16_ssse3
(
unsigned
char
*
src_ptr
,
...
...
@@ -509,21 +529,24 @@ void vp8_sixtap_predict4x4_ssse3
int
dst_pitch
)
{
DECLARE_ALIGNED_ARRAY
(
16
,
unsigned
char
,
FData2
,
16
*
16
);
if
(
xoffset
)
{
if
(
yoffset
)
{
}
else
{
}
}
else
{
}
DECLARE_ALIGNED_ARRAY
(
16
,
unsigned
char
,
FData2
,
4
*
9
);
if
(
xoffset
)
{
if
(
yoffset
)
{
vp8_filter_block1d4_h6_ssse3
(
src_ptr
-
(
2
*
src_pixels_per_line
),
src_pixels_per_line
,
FData2
,
4
,
9
,
xoffset
);
vp8_filter_block1d4_v6_ssse3
(
FData2
,
4
,
dst_ptr
,
dst_pitch
,
4
,
yoffset
);
}
else
{
vp8_filter_block1d4_h6_ssse3
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pitch
,
4
,
xoffset
);
}
}
else
{
vp8_filter_block1d4_v6_ssse3
(
src_ptr
-
(
2
*
src_pixels_per_line
),
src_pixels_per_line
,
dst_ptr
,
dst_pitch
,
4
,
yoffset
);
}
}
...
...
vp8/common/x86/x86_systemdependent.c
View file @
f5615b61
...
...
@@ -123,7 +123,7 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd
->
subpix
.
sixtap16x16
=
vp8_sixtap_predict16x16_ssse3
;
rtcd
->
subpix
.
sixtap8x8
=
vp8_sixtap_predict8x8_ssse3
;
rtcd
->
subpix
.
sixtap8x4
=
vp8_sixtap_predict8x4_ssse3
;
//rtcd->subpix.sixtap4x4 = vp8_sixtap_predict4x4_ssse3;
rtcd
->
subpix
.
sixtap4x4
=
vp8_sixtap_predict4x4_ssse3
;
}
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment