Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Guillaume Martres
aom-rav1e
Commits
7ad56bf3
Commit
7ad56bf3
authored
Feb 05, 2014
by
Yunqing Wang
Committed by
Gerrit Code Review
Feb 05, 2014
Browse files
Merge "Optimize bilinear sub-pixel filters in ssse3"
parents
6432ae49
d1961e6f
Changes
3
Hide whitespace changes
Inline
Side-by-side
vp9/common/x86/vp9_asm_stubs.c
View file @
7ad56bf3
...
...
@@ -23,8 +23,8 @@ typedef void filter8_1dfunction (
const
short
*
filter
);
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt
1, opt2
) \
void vp9_convolve8_##name##_##opt
1
(const uint8_t *src, ptrdiff_t src_stride, \
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
...
...
@@ -32,7 +32,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \
if (step_q4 == 16 && filter[3] != 128) { \
if (filter[0] || filter[1] || filter[2]) { \
while (w >= 16) { \
vp9_filter_block1d16_##dir##8_##avg##opt
1
(src_start, src_stride, \
vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \
dst, dst_stride, \
h, filter); \
src += 16; \
...
...
@@ -40,7 +40,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \
w -= 16; \
} \
while (w >= 8) { \
vp9_filter_block1d8_##dir##8_##avg##opt
1
(src_start, src_stride, \
vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \
dst, dst_stride, \
h, filter); \
src += 8; \
...
...
@@ -48,7 +48,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \
w -= 8; \
} \
while (w >= 4) { \
vp9_filter_block1d4_##dir##8_##avg##opt
1
(src_start, src_stride, \
vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \
dst, dst_stride, \
h, filter); \
src += 4; \
...
...
@@ -57,7 +57,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \
} \
} else { \
while (w >= 16) { \
vp9_filter_block1d16_##dir##2_##avg##opt
2
(src, src_stride, \
vp9_filter_block1d16_##dir##2_##avg##opt(src, src_stride, \
dst, dst_stride, \
h, filter); \
src += 16; \
...
...
@@ -65,7 +65,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \
w -= 16; \
} \
while (w >= 8) { \
vp9_filter_block1d8_##dir##2_##avg##opt
2
(src, src_stride, \
vp9_filter_block1d8_##dir##2_##avg##opt(src, src_stride, \
dst, dst_stride, \
h, filter); \
src += 8; \
...
...
@@ -73,7 +73,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \
w -= 8; \
} \
while (w >= 4) { \
vp9_filter_block1d4_##dir##2_##avg##opt
2
(src, src_stride, \
vp9_filter_block1d4_##dir##2_##avg##opt(src, src_stride, \
dst, dst_stride, \
h, filter); \
src += 4; \
...
...
@@ -136,18 +136,18 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction
vp9_filter_block1d4_v8_avg_ssse3
;
filter8_1dfunction
vp9_filter_block1d4_h8_avg_ssse3
;
filter8_1dfunction
vp9_filter_block1d16_v2_ss
e2
;
filter8_1dfunction
vp9_filter_block1d16_h2_ss
e2
;
filter8_1dfunction
vp9_filter_block1d8_v2_ss
e2
;
filter8_1dfunction
vp9_filter_block1d8_h2_ss
e2
;
filter8_1dfunction
vp9_filter_block1d4_v2_ss
e2
;
filter8_1dfunction
vp9_filter_block1d4_h2_ss
e2
;
filter8_1dfunction
vp9_filter_block1d16_v2_avg_ss
e2
;
filter8_1dfunction
vp9_filter_block1d16_h2_avg_ss
e2
;
filter8_1dfunction
vp9_filter_block1d8_v2_avg_ss
e2
;
filter8_1dfunction
vp9_filter_block1d8_h2_avg_ss
e2
;
filter8_1dfunction
vp9_filter_block1d4_v2_avg_ss
e2
;
filter8_1dfunction
vp9_filter_block1d4_h2_avg_ss
e2
;
filter8_1dfunction
vp9_filter_block1d16_v2_ss
se3
;
filter8_1dfunction
vp9_filter_block1d16_h2_ss
se3
;
filter8_1dfunction
vp9_filter_block1d8_v2_ss
se3
;
filter8_1dfunction
vp9_filter_block1d8_h2_ss
se3
;
filter8_1dfunction
vp9_filter_block1d4_v2_ss
se3
;
filter8_1dfunction
vp9_filter_block1d4_h2_ss
se3
;
filter8_1dfunction
vp9_filter_block1d16_v2_avg_ss
se3
;
filter8_1dfunction
vp9_filter_block1d16_h2_avg_ss
se3
;
filter8_1dfunction
vp9_filter_block1d8_v2_avg_ss
se3
;
filter8_1dfunction
vp9_filter_block1d8_h2_avg_ss
se3
;
filter8_1dfunction
vp9_filter_block1d4_v2_avg_ss
se3
;
filter8_1dfunction
vp9_filter_block1d4_h2_avg_ss
se3
;
// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
...
...
@@ -169,11 +169,11 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_1D
(
horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
,
ssse3
,
sse2
);
FUN_CONV_1D
(
vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
,
ssse3
,
sse2
);
FUN_CONV_1D
(
avg_horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
avg_
,
ssse3
,
sse2
);
FUN_CONV_1D
(
horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
,
ssse3
);
FUN_CONV_1D
(
vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
,
ssse3
);
FUN_CONV_1D
(
avg_horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
avg_
,
ssse3
);
FUN_CONV_1D
(
avg_vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
avg_
,
ssse3
,
sse2
);
ssse3
);
// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
...
...
@@ -236,11 +236,10 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_1D
(
horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
,
sse2
,
sse2
);
FUN_CONV_1D
(
vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
,
sse2
,
sse2
);
FUN_CONV_1D
(
avg_horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
avg_
,
sse2
,
sse2
);
FUN_CONV_1D
(
avg_vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
avg_
,
sse2
,
sse2
);
FUN_CONV_1D
(
horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
,
sse2
);
FUN_CONV_1D
(
vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
,
sse2
);
FUN_CONV_1D
(
avg_horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
avg_
,
sse2
);
FUN_CONV_1D
(
avg_vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
avg_
,
sse2
);
// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
...
...
vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm
0 → 100644
View file @
7ad56bf3
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; GET_PARAM_4 -- register setup for the 4-pixel-wide bilinear kernels.
;
; Reads the six stack arguments through arg() and leaves:
;   rsi  = arg(0)  src_ptr
;   rax  = arg(1)  pixels_per_line  (32-bit value, sign-extended)
;   rdi  = arg(2)  output_ptr
;   rdx  = arg(3)  out_pitch        (32-bit value, sign-extended)
;   rcx  = arg(4)  output_height    (32-bit value, sign-extended)
;   xmm3 = words 3 and 4 of the filter block at arg(5), narrowed to a
;          signed-byte pair (k3,k4) and repeated in the low four words
;   xmm2 = 0x0040 (64) in every word: the rounding term added before >> 7
;
; rdx and rcx double as scratch (filter pointer / rounding constant), so
; their final values are loaded last.
; The filter block is read with movdqa, i.e. it must be 16-byte aligned.
; packsswb saturates each tap to a signed byte.
;-----------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; output_ptr
    movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line

    mov         rdx, arg(5)                 ; filter ptr (scratch use of rdx)
    movdqa      xmm3, [rdx]                 ; all eight 16-bit taps
    psrldq      xmm3, 6                     ; shift out words 0..2 -> word 3 is lowest
    packsswb    xmm3, xmm3                  ; words -> signed bytes
    pshuflw     xmm3, xmm3, 0b              ; replicate the (k3,k4) byte pair

    mov         rcx, 0x0400040              ; two words of 64 (scratch use of rcx)
    movq        xmm2, rcx
    pshufd      xmm2, xmm2, 0               ; 64 in all eight words

    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
%endmacro
;-----------------------------------------------------------------------
; APPLY_FILTER_4 avg -- filter one row of four pixels and step to the next.
;
; In:   xmm0 = first-tap pixels, xmm1 = second-tap pixels (low bytes)
;       xmm3 = (k3,k4) signed-byte pairs, xmm2 = rounding words
;       rsi/rdi = source/destination row, rax/rdx = their strides
;       rcx = rows remaining
; %1:   non-zero -> average the result with the 4 bytes already at [rdi]
; Out:  4 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1
;       ZF reflects the decremented rcx (lea does not touch flags), so the
;       caller can branch with jnz straight after the macro.
; Clobbers xmm0, xmm1 (when %1 is non-zero), flags.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    punpcklbw   xmm0, xmm1                  ; interleave: a0 b0 a1 b1 ...
    pmaddubsw   xmm0, xmm3                  ; per word: a*k3 + b*k4
    paddsw      xmm0, xmm2                  ; + rounding
    psraw       xmm0, 7                     ; scale back down
    packuswb    xmm0, xmm0                  ; saturate to unsigned bytes
%if %1
    movd        xmm1, [rdi]                 ; current destination pixels
    pavgb       xmm0, xmm1                  ; rounded average with them
%endif
    movd        [rdi], xmm0

    lea         rdi, [rdi + rdx]            ; next destination row
    lea         rsi, [rsi + rax]            ; next source row
    dec         rcx
%endmacro
;-----------------------------------------------------------------------
; GET_PARAM -- register setup for the 8- and 16-pixel-wide bilinear kernels.
;
; Reads the six stack arguments through arg() and leaves:
;   rsi  = arg(0)  src_ptr
;   rax  = arg(1)  pixels_per_line  (32-bit value, sign-extended)
;   rdi  = arg(2)  output_ptr
;   rdx  = arg(3)  out_pitch        (32-bit value, sign-extended)
;   rcx  = arg(4)  output_height    (32-bit value, sign-extended)
;   xmm7 = words 3 and 4 of the filter block at arg(5), narrowed to a
;          signed-byte pair (k3,k4) and repeated in all eight words
;   xmm6 = 0x0040 (64) in every word: the rounding term added before >> 7
;
; rdx and rcx double as scratch (filter pointer / rounding constant), so
; their final values are loaded last.
; The filter block is read with movdqa, i.e. it must be 16-byte aligned.
; Uses xmm6/xmm7; the functions that invoke this macro wrap it in
; SAVE_XMM 7 / RESTORE_XMM.
;-----------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; output_ptr
    movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line

    mov         rdx, arg(5)                 ; filter ptr (scratch use of rdx)
    movdqa      xmm7, [rdx]                 ; all eight 16-bit taps
    psrldq      xmm7, 6                     ; shift out words 0..2 -> word 3 is lowest
    packsswb    xmm7, xmm7                  ; words -> signed bytes
    pshuflw     xmm7, xmm7, 0b              ; (k3,k4) in the low four words
    punpcklwd   xmm7, xmm7                  ; ... widened to all eight words

    mov         rcx, 0x0400040              ; two words of 64 (scratch use of rcx)
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ; 64 in all eight words

    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
%endmacro
;-----------------------------------------------------------------------
; APPLY_FILTER_8 avg -- filter one row of eight pixels and step to the next.
;
; In:   xmm0 = first-tap pixels, xmm1 = second-tap pixels (low 8 bytes)
;       xmm7 = (k3,k4) signed-byte pairs, xmm6 = rounding words
;       rsi/rdi = source/destination row, rax/rdx = their strides
;       rcx = rows remaining
; %1:   non-zero -> average the result with the 8 bytes already at [rdi]
; Out:  8 bytes written at [rdi]; rsi += rax; rdi += rdx; rcx -= 1
;       ZF reflects the decremented rcx (lea does not touch flags), so the
;       caller can branch with jnz straight after the macro.
; Clobbers xmm0, xmm1 (when %1 is non-zero), flags.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm1                  ; interleave: a0 b0 a1 b1 ...
    pmaddubsw   xmm0, xmm7                  ; per word: a*k3 + b*k4
    paddsw      xmm0, xmm6                  ; + rounding
    psraw       xmm0, 7                     ; scale back down
    packuswb    xmm0, xmm0                  ; saturate to unsigned bytes
%if %1
    movq        xmm1, [rdi]                 ; current destination pixels
    pavgb       xmm0, xmm1                  ; rounded average with them
%endif
    movq        [rdi], xmm0                 ; write the eight results

    lea         rdi, [rdi + rdx]            ; next destination row
    lea         rsi, [rsi + rax]            ; next source row
    dec         rcx
%endmacro
;-----------------------------------------------------------------------
; APPLY_FILTER_16 avg -- filter one row of sixteen pixels and step on.
;
; In:   xmm0 = first-tap pixels (16 bytes), xmm2 = a copy of xmm0
;       xmm1 = second-tap pixels (16 bytes)
;       xmm7 = (k3,k4) signed-byte pairs, xmm6 = rounding words
;       rsi/rdi = source/destination row, rax/rdx = their strides
;       rcx = rows remaining
; %1:   non-zero -> average the result with the 16 bytes already at [rdi]
; Out:  16 bytes written at [rdi] (unaligned store); rsi += rax;
;       rdi += rdx; rcx -= 1
;       ZF reflects the decremented rcx (lea does not touch flags), so the
;       caller can branch with jnz straight after the macro.
; Clobbers xmm0, xmm2, xmm1 (when %1 is non-zero), flags.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    ; pixels 0..7
    punpcklbw   xmm0, xmm1                  ; interleave low halves
    pmaddubsw   xmm0, xmm7                  ; per word: a*k3 + b*k4
    paddsw      xmm0, xmm6                  ; + rounding
    psraw       xmm0, 7

    ; pixels 8..15
    punpckhbw   xmm2, xmm1                  ; interleave high halves
    pmaddubsw   xmm2, xmm7
    paddsw      xmm2, xmm6
    psraw       xmm2, 7

    packuswb    xmm0, xmm2                  ; both halves -> 16 unsigned bytes
%if %1
    movdqu      xmm1, [rdi]                 ; current destination pixels
    pavgb       xmm0, xmm1                  ; rounded average with them
%endif
    movdqu      [rdi], xmm0                 ; write the sixteen results

    lea         rdi, [rdi + rdx]            ; next destination row
    lea         rsi, [rsi + rax]            ; next source row
    dec         rcx
%endmacro
;-----------------------------------------------------------------------
; vp9_filter_block1d4_v2_ssse3
;   Vertical 2-tap filter over a 4-pixel-wide column.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: out[i] = sat_u8((src[i]*k3 + src[i + pixels_per_line]*k4 + 64) >> 7)
;          where k3/k4 are words 3 and 4 of the filter block.
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0-xmm3 only; rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE
sym(vp9_filter_block1d4_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi

    GET_PARAM_4

.next_row:
    movd        xmm1, [rsi + rax]           ; four pixels from the row below
    movd        xmm0, [rsi]                 ; four pixels from the current row
    APPLY_FILTER_4 0                        ; store, advance, dec rcx
    jnz         .next_row

    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d8_v2_ssse3
;   Vertical 2-tap filter over an 8-pixel-wide column.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: out[i] = sat_u8((src[i]*k3 + src[i + pixels_per_line]*k4 + 64) >> 7)
;          where k3/k4 are words 3 and 4 of the filter block.
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0, xmm1, xmm6, xmm7; xmm6/xmm7 are bracketed by
; SAVE_XMM 7 / RESTORE_XMM. rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE
sym(vp9_filter_block1d8_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM

.loop:
    movq        xmm0, [rsi]                 ; eight pixels, current row
    movq        xmm1, [rsi + rax]           ; eight pixels, row below
    APPLY_FILTER_8 0                        ; store, advance, dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM                             ; must mirror SAVE_XMM 7 above
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d16_v2_ssse3
;   Vertical 2-tap filter over a 16-pixel-wide column.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: out[i] = sat_u8((src[i]*k3 + src[i + pixels_per_line]*k4 + 64) >> 7)
;          where k3/k4 are words 3 and 4 of the filter block.
; Source and destination rows are accessed with unaligned 16-byte moves.
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0-xmm2, xmm6, xmm7; xmm6/xmm7 are bracketed by
; SAVE_XMM 7 / RESTORE_XMM. rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE
sym(vp9_filter_block1d16_v2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM

.loop:
    movdqu      xmm0, [rsi]                 ; sixteen pixels, current row
    movdqu      xmm1, [rsi + rax]           ; sixteen pixels, row below
    movdqa      xmm2, xmm0                  ; copy for the high-half unpack
    APPLY_FILTER_16 0                       ; store, advance, dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM                             ; must mirror SAVE_XMM 7 above
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d4_v2_avg_ssse3
;   Vertical 2-tap filter over a 4-pixel-wide column, averaged into the
;   destination.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: f[i]   = sat_u8((src[i]*k3 + src[i + pixels_per_line]*k4 + 64) >> 7)
;          out[i] = (out[i] + f[i] + 1) >> 1          (pavgb)
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0-xmm3 only; rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi

    GET_PARAM_4

.next_row:
    movd        xmm1, [rsi + rax]           ; four pixels from the row below
    movd        xmm0, [rsi]                 ; four pixels from the current row
    APPLY_FILTER_4 1                        ; filter, average with dst, advance
    jnz         .next_row

    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d8_v2_avg_ssse3
;   Vertical 2-tap filter over an 8-pixel-wide column, averaged into the
;   destination.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: f[i]   = sat_u8((src[i]*k3 + src[i + pixels_per_line]*k4 + 64) >> 7)
;          out[i] = (out[i] + f[i] + 1) >> 1          (pavgb)
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0, xmm1, xmm6, xmm7; xmm6/xmm7 are bracketed by
; SAVE_XMM 7 / RESTORE_XMM. rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM

.loop:
    movq        xmm0, [rsi]                 ; eight pixels, current row
    movq        xmm1, [rsi + rax]           ; eight pixels, row below
    APPLY_FILTER_8 1                        ; filter, average with dst, advance
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM                             ; must mirror SAVE_XMM 7 above
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d16_v2_avg_ssse3
;   Vertical 2-tap filter over a 16-pixel-wide column, averaged into the
;   destination.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: f[i]   = sat_u8((src[i]*k3 + src[i + pixels_per_line]*k4 + 64) >> 7)
;          out[i] = (out[i] + f[i] + 1) >> 1          (pavgb)
; Source and destination rows are accessed with unaligned 16-byte moves.
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0-xmm2, xmm6, xmm7; xmm6/xmm7 are bracketed by
; SAVE_XMM 7 / RESTORE_XMM. rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM

.loop:
    movdqu      xmm0, [rsi]                 ; sixteen pixels, current row
    movdqu      xmm1, [rsi + rax]           ; sixteen pixels, row below
    movdqa      xmm2, xmm0                  ; copy for the high-half unpack
    APPLY_FILTER_16 1                       ; filter, average with dst, advance
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM                             ; must mirror SAVE_XMM 7 above
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d4_h2_ssse3
;   Horizontal 2-tap filter over a 4-pixel-wide block.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: out[i] = sat_u8((src[i]*k3 + src[i + 1]*k4 + 64) >> 7)
;          where k3/k4 are words 3 and 4 of the filter block.
; NOTE: each row is fetched with a 16-byte unaligned load although only
;       src[0..4] contribute to the four stored results.
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0-xmm3 only; rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
sym(vp9_filter_block1d4_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi

    GET_PARAM_4

.next_row:
    movdqu      xmm0, [rsi]                 ; src[0..15]
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ; src[1..15]: right-hand neighbours
    APPLY_FILTER_4 0                        ; store, advance, dec rcx
    jnz         .next_row

    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d8_h2_ssse3
;   Horizontal 2-tap filter over an 8-pixel-wide block.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: out[i] = sat_u8((src[i]*k3 + src[i + 1]*k4 + 64) >> 7)
;          where k3/k4 are words 3 and 4 of the filter block.
; NOTE: each row is fetched with a 16-byte unaligned load although only
;       src[0..8] contribute to the eight stored results.
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0, xmm1, xmm6, xmm7; xmm6/xmm7 are bracketed by
; SAVE_XMM 7 / RESTORE_XMM. rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
sym(vp9_filter_block1d8_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM

.loop:
    movdqu      xmm0, [rsi]                 ; src[0..15]
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ; src[1..15]: right-hand neighbours
    APPLY_FILTER_8 0                        ; store, advance, dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM                             ; must mirror SAVE_XMM 7 above
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d16_h2_ssse3
;   Horizontal 2-tap filter over a 16-pixel-wide block.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: out[i] = sat_u8((src[i]*k3 + src[i + 1]*k4 + 64) >> 7)
;          where k3/k4 are words 3 and 4 of the filter block.
; Each row reads src[0..16] (two overlapping unaligned 16-byte loads).
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0-xmm2, xmm6, xmm7; xmm6/xmm7 are bracketed by
; SAVE_XMM 7 / RESTORE_XMM. rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
sym(vp9_filter_block1d16_h2_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM

.loop:
    movdqu      xmm0, [rsi]                 ; src[0..15]
    movdqu      xmm1, [rsi + 1]             ; src[1..16]: right-hand neighbours
    movdqa      xmm2, xmm0                  ; copy for the high-half unpack
    APPLY_FILTER_16 0                       ; store, advance, dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM                             ; must mirror SAVE_XMM 7 above
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d4_h2_avg_ssse3
;   Horizontal 2-tap filter over a 4-pixel-wide block, averaged into the
;   destination.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: f[i]   = sat_u8((src[i]*k3 + src[i + 1]*k4 + 64) >> 7)
;          out[i] = (out[i] + f[i] + 1) >> 1          (pavgb)
; NOTE: each row is fetched with a 16-byte unaligned load although only
;       src[0..4] contribute to the four stored results.
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0-xmm3 only; rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi

    GET_PARAM_4

.next_row:
    movdqu      xmm0, [rsi]                 ; src[0..15]
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ; src[1..15]: right-hand neighbours
    APPLY_FILTER_4 1                        ; filter, average with dst, advance
    jnz         .next_row

    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d8_h2_avg_ssse3
;   Horizontal 2-tap filter over an 8-pixel-wide block, averaged into the
;   destination.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: f[i]   = sat_u8((src[i]*k3 + src[i + 1]*k4 + 64) >> 7)
;          out[i] = (out[i] + f[i] + 1) >> 1          (pavgb)
; NOTE: each row is fetched with a 16-byte unaligned load although only
;       src[0..8] contribute to the eight stored results.
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0, xmm1, xmm6, xmm7; xmm6/xmm7 are bracketed by
; SAVE_XMM 7 / RESTORE_XMM. rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM

.loop:
    movdqu      xmm0, [rsi]                 ; src[0..15]
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ; src[1..15]: right-hand neighbours
    APPLY_FILTER_8 1                        ; filter, average with dst, advance
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM                             ; must mirror SAVE_XMM 7 above
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; vp9_filter_block1d16_h2_avg_ssse3
;   Horizontal 2-tap filter over a 16-pixel-wide block, averaged into the
;   destination.
; Args (stack, via arg()): 0 src_ptr, 1 pixels_per_line, 2 output_ptr,
;                          3 out_pitch, 4 output_height, 5 filter
; Per row: f[i]   = sat_u8((src[i]*k3 + src[i + 1]*k4 + 64) >> 7)
;          out[i] = (out[i] + f[i] + 1) >> 1          (pavgb)
; Each row reads src[0..16] (two overlapping unaligned 16-byte loads).
; The row counter is tested after the body runs, so output_height == 0
; is not handled.
; Uses xmm0-xmm2, xmm6, xmm7; xmm6/xmm7 are bracketed by
; SAVE_XMM 7 / RESTORE_XMM. rsi/rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h2_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM

.loop:
    movdqu      xmm0, [rsi]                 ; src[0..15]
    movdqu      xmm1, [rsi + 1]             ; src[1..16]: right-hand neighbours
    movdqa      xmm2, xmm0                  ; copy for the high-half unpack
    APPLY_FILTER_16 1                       ; filter, average with dst, advance
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM                             ; must mirror SAVE_XMM 7 above
    UNSHADOW_ARGS
    pop         rbp
    ret
vp9/vp9_common.mk
View file @
7ad56bf3
...
...
@@ -76,6 +76,7 @@ VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2)
+=
common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2)
+=
common/x86/vp9_subpixel_bilinear_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3)
+=
common/x86/vp9_subpixel_8t_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3)
+=
common/x86/vp9_subpixel_bilinear_ssse3.asm
ifeq
($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX)
+=
common/x86/vp9_postproc_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2)
+=
common/x86/vp9_postproc_sse2.asm
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment