Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
cabfd505
Commit
cabfd505
authored
Feb 03, 2014
by
Yunqing Wang
Committed by
Gerrit Code Review
Feb 03, 2014
Browse files
Merge "Optimize bilinear sub-pixel filters in sse2"
parents
5470e0cf
2488cb34
Changes
3
Hide whitespace changes
Inline
Side-by-side
vp9/common/x86/vp9_asm_stubs.c
View file @
cabfd505
...
...
@@ -23,36 +23,63 @@ typedef void filter8_1dfunction (
const
short
*
filter
);
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
int w, int h) { \
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt
1, opt2
) \
void vp9_convolve8_##name##_##opt
1
(const uint8_t *src, ptrdiff_t src_stride, \
uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
int w, int h) { \
if (step_q4 == 16 && filter[3] != 128) { \
while (w >= 16) { \
vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \
dst, dst_stride, \
h, filter); \
src += 16; \
dst += 16; \
w -= 16; \
} \
while (w >= 8) { \
vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \
dst, dst_stride, \
h, filter); \
src += 8; \
dst += 8; \
w -= 8; \
} \
while (w >= 4) { \
vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \
dst, dst_stride, \
h, filter); \
src += 4; \
dst += 4; \
w -= 4; \
if (filter[0] || filter[1] || filter[2]) { \
while (w >= 16) { \
vp9_filter_block1d16_##dir##8_##avg##opt1(src_start, src_stride, \
dst, dst_stride, \
h, filter); \
src += 16; \
dst += 16; \
w -= 16; \
} \
while (w >= 8) { \
vp9_filter_block1d8_##dir##8_##avg##opt1(src_start, src_stride, \
dst, dst_stride, \
h, filter); \
src += 8; \
dst += 8; \
w -= 8; \
} \
while (w >= 4) { \
vp9_filter_block1d4_##dir##8_##avg##opt1(src_start, src_stride, \
dst, dst_stride, \
h, filter); \
src += 4; \
dst += 4; \
w -= 4; \
} \
} else { \
while (w >= 16) { \
vp9_filter_block1d16_##dir##2_##avg##opt2(src, src_stride, \
dst, dst_stride, \
h, filter); \
src += 16; \
dst += 16; \
w -= 16; \
} \
while (w >= 8) { \
vp9_filter_block1d8_##dir##2_##avg##opt2(src, src_stride, \
dst, dst_stride, \
h, filter); \
src += 8; \
dst += 8; \
w -= 8; \
} \
while (w >= 4) { \
vp9_filter_block1d4_##dir##2_##avg##opt2(src, src_stride, \
dst, dst_stride, \
h, filter); \
src += 4; \
dst += 4; \
w -= 4; \
} \
} \
} \
if (w) { \
...
...
@@ -68,17 +95,27 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
int w, int h) { \
DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
\
assert(w <= 64); \
assert(h <= 64); \
if (x_step_q4 == 16 && y_step_q4 == 16) { \
vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 7); \
vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h); \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 7); \
vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} else { \
DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \
vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 1); \
vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} \
} else { \
vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
...
...
@@ -99,6 +136,19 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction
vp9_filter_block1d4_v8_avg_ssse3
;
filter8_1dfunction
vp9_filter_block1d4_h8_avg_ssse3
;
// 2-tap SSE2 kernels, one per block width (16/8/4) and direction (v/h).
// The *_avg_* variants are the averaging counterparts.
// All of them share the filter8_1dfunction signature.
filter8_1dfunction vp9_filter_block1d16_v2_sse2;
filter8_1dfunction vp9_filter_block1d16_h2_sse2;
filter8_1dfunction vp9_filter_block1d8_v2_sse2;
filter8_1dfunction vp9_filter_block1d8_h2_sse2;
filter8_1dfunction vp9_filter_block1d4_v2_sse2;
filter8_1dfunction vp9_filter_block1d4_h2_sse2;
filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
...
...
@@ -119,11 +169,11 @@ filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_1D
(
horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
,
ssse3
);
FUN_CONV_1D
(
vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
,
ssse3
);
FUN_CONV_1D
(
avg_horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
avg_
,
ssse3
);
FUN_CONV_1D
(
horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
,
ssse3
,
sse2
);
FUN_CONV_1D
(
vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
,
ssse3
,
sse2
);
FUN_CONV_1D
(
avg_horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
avg_
,
ssse3
,
sse2
);
FUN_CONV_1D
(
avg_vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
avg_
,
ssse3
);
ssse3
,
sse2
);
// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
...
...
@@ -153,6 +203,19 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
filter8_1dfunction
vp9_filter_block1d4_v8_avg_sse2
;
filter8_1dfunction
vp9_filter_block1d4_h8_avg_sse2
;
// 2-tap SSE2 kernels, one per block width (16/8/4) and direction (v/h).
// The *_avg_* variants are the averaging counterparts.
// All of them share the filter8_1dfunction signature.
filter8_1dfunction vp9_filter_block1d16_v2_sse2;
filter8_1dfunction vp9_filter_block1d16_h2_sse2;
filter8_1dfunction vp9_filter_block1d8_v2_sse2;
filter8_1dfunction vp9_filter_block1d8_h2_sse2;
filter8_1dfunction vp9_filter_block1d4_v2_sse2;
filter8_1dfunction vp9_filter_block1d4_h2_sse2;
filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
...
...
@@ -173,10 +236,11 @@ filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_1D
(
horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
,
sse2
);
FUN_CONV_1D
(
vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
,
sse2
);
FUN_CONV_1D
(
avg_horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
avg_
,
sse2
);
FUN_CONV_1D
(
avg_vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
avg_
,
sse2
);
FUN_CONV_1D
(
horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
,
sse2
,
sse2
);
FUN_CONV_1D
(
vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
,
sse2
,
sse2
);
FUN_CONV_1D
(
avg_horiz
,
x_step_q4
,
filter_x
,
h
,
src
,
avg_
,
sse2
,
sse2
);
FUN_CONV_1D
(
avg_vert
,
y_step_q4
,
filter_y
,
v
,
src
-
src_stride
*
3
,
avg_
,
sse2
,
sse2
);
// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
...
...
vp9/common/x86/vp9_subpixel_bilinear_sse2.asm
0 → 100644
View file @
cabfd505
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
; GET_PARAM_4: shared setup for the 4-pixel-wide 2-tap kernels.
; Reads the six arguments and leaves:
;   rsi  = arg(0) source pointer        rax = arg(1) source stride
;   rdi  = arg(2) destination pointer   rdx = arg(3) destination stride
;   rcx  = arg(4) number of rows
;   xmm4 = filter word 3 in words 0-3, filter word 4 in words 4-7
;   xmm3 = 64 in every 16-bit word (rounding term for the >> 7 below)
;   xmm2 = zero (used to widen bytes to words)
; The filter table at arg(5) is read with movdqa, so it must be
; 16-byte aligned.
%macro GET_PARAM_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64

    movdqa      xmm3, [rdx]                 ;load filters
    pshuflw     xmm4, xmm3, 11111111b       ;k3
    psrldq      xmm3, 8
    pshuflw     xmm3, xmm3, 0b              ;k4
    punpcklqdq  xmm4, xmm3                  ;k3k4

    movq        xmm3, rcx                   ;rounding
    pshufd      xmm3, xmm3, 0               ;broadcast to all words

    pxor        xmm2, xmm2

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
; APPLY_FILTER_4 avg: produce one row of 4 output pixels, then step to
; the next row.
; Inputs: xmm0 = first-tap pixels, xmm1 = second-tap pixels (low 4 bytes
; of each); xmm2/xmm3/xmm4 as prepared by GET_PARAM_4.
; Result per pixel: (p0 * k3 + p1 * k4 + 64) >> 7, saturated to a byte.
; %1 != 0: the result is averaged (pavgb) with the 4 bytes already at
; [rdi] before being stored.
; The macro ends with `dec rcx` and the preceding lea instructions do not
; touch flags, so the caller can test ZF immediately afterwards.
%macro APPLY_FILTER_4 1
    punpckldq   xmm0, xmm1                  ;both taps' pixels in one register
    punpcklbw   xmm0, xmm2                  ;unpack to word
    pmullw      xmm0, xmm4                  ;multiply the filter factors

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ;bring second-tap products down
    paddsw      xmm0, xmm1                  ;sum the two taps

    paddsw      xmm0, xmm3                  ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif

    movd        [rdi], xmm0
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm
; GET_PARAM: shared setup for the 8- and 16-pixel-wide 2-tap kernels.
; Reads the six arguments and leaves:
;   rsi  = arg(0) source pointer        rax = arg(1) source stride
;   rdi  = arg(2) destination pointer   rdx = arg(3) destination stride
;   rcx  = arg(4) number of rows
;   xmm6 = filter word 3 replicated in all 8 words
;   xmm7 = filter word 4 replicated in all 8 words
;   xmm4 = 64 in every 16-bit word (rounding term for the >> 7 below)
;   xmm5 = zero (used to widen bytes to words)
; The filter table at arg(5) is read with movdqa, so it must be
; 16-byte aligned.
%macro GET_PARAM 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64

    movdqa      xmm7, [rdx]                 ;load filters

    pshuflw     xmm6, xmm7, 11111111b       ;k3
    pshufhw     xmm7, xmm7, 0b              ;k4
    punpcklwd   xmm6, xmm6                  ;k3 in all 8 words
    punpckhwd   xmm7, xmm7                  ;k4 in all 8 words

    movq        xmm4, rcx                   ;rounding
    pshufd      xmm4, xmm4, 0               ;broadcast to all words

    pxor        xmm5, xmm5

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm
; APPLY_FILTER_8 avg: produce one row of 8 output pixels, then step to
; the next row.
; Inputs: xmm0 = first-tap pixels, xmm1 = second-tap pixels (low 8 bytes
; of each); xmm4/xmm5/xmm6/xmm7 as prepared by GET_PARAM.
; Result per pixel: (p0 * k3 + p1 * k4 + 64) >> 7, saturated to a byte.
; %1 != 0: the result is averaged (pavgb) with the 8 bytes already at
; [rdi] before being stored.
; The macro ends with `dec rcx` and the preceding lea instructions do not
; touch flags, so the caller can test ZF immediately afterwards.
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm5                  ;unpack to word
    punpcklbw   xmm1, xmm5

    pmullw      xmm0, xmm6                  ;p0 * k3
    pmullw      xmm1, xmm7                  ;p1 * k4
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm4                  ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack back to byte

%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif

    movq        [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm
; APPLY_FILTER_16 avg: produce one row of 16 output pixels, then step to
; the next row.
; Inputs: xmm0 and xmm2 both hold the 16 first-tap pixels, xmm1 and xmm3
; both hold the 16 second-tap pixels (the low halves are widened from
; xmm0/xmm1, the high halves from xmm2/xmm3); xmm4/xmm5/xmm6/xmm7 as
; prepared by GET_PARAM.
; Result per pixel: (p0 * k3 + p1 * k4 + 64) >> 7, saturated to a byte.
; %1 != 0: the result is averaged (pavgb) with the 16 bytes already at
; [rdi] before being stored. Loads and stores at [rdi] are unaligned.
; The macro ends with `dec rcx` and the preceding lea instructions do not
; touch flags, so the caller can test ZF immediately afterwards.
%macro APPLY_FILTER_16 1
    punpcklbw   xmm0, xmm5                  ;low 8 pixels to words
    punpcklbw   xmm1, xmm5
    punpckhbw   xmm2, xmm5                  ;high 8 pixels to words
    punpckhbw   xmm3, xmm5

    pmullw      xmm0, xmm6                  ;p0 * k3
    pmullw      xmm1, xmm7                  ;p1 * k4
    pmullw      xmm2, xmm6
    pmullw      xmm3, xmm7

    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm3

    paddsw      xmm0, xmm4                  ;rounding
    paddsw      xmm2, xmm4
    psraw       xmm0, 7                     ;shift
    psraw       xmm2, 7
    packuswb    xmm0, xmm2                  ;pack back to byte

%if %1
    movdqu      xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif

    movdqu      [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm
; vp9_filter_block1d4_v2_sse2: vertical 2-tap filter, 4 pixels wide.
; Arguments (by index): 0 = src, 1 = src stride, 2 = dst, 3 = dst stride,
; 4 = row count, 5 = filter table.
; Each output row combines the source row at [rsi] with the row one
; stride below it ([rsi + rax]); the result overwrites the destination.
global sym(vp9_filter_block1d4_v2_sse2) PRIVATE
sym(vp9_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;load src
    movd        xmm1, [rsi + rax]           ;row below

    APPLY_FILTER_4 0                        ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
; vp9_filter_block1d8_v2_sse2: vertical 2-tap filter, 8 pixels wide.
; Arguments (by index): 0 = src, 1 = src stride, 2 = dst, 3 = dst stride,
; 4 = row count, 5 = filter table.
; Each output row combines the source row at [rsi] with the row one
; stride below it ([rsi + rax]); the result overwrites the destination.
; NOTE(review): SAVE_XMM 7 / RESTORE_XMM come from x86_abi_support.asm;
; presumably they preserve xmm6/xmm7 (used by GET_PARAM) where the ABI
; treats them as callee-saved -- confirm against that file.
global sym(vp9_filter_block1d8_v2_sse2) PRIVATE
sym(vp9_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;0
    movq        xmm1, [rsi + rax]           ;1

    APPLY_FILTER_8 0                        ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
; vp9_filter_block1d16_v2_sse2: vertical 2-tap filter, 16 pixels wide.
; Arguments (by index): 0 = src, 1 = src stride, 2 = dst, 3 = dst stride,
; 4 = row count, 5 = filter table.
; Each output row combines the source row at [rsi] with the row one
; stride below it ([rsi + rax]); the result overwrites the destination.
; NOTE(review): SAVE_XMM 7 / RESTORE_XMM come from x86_abi_support.asm;
; presumably they preserve xmm6/xmm7 (used by GET_PARAM) where the ABI
; treats them as callee-saved -- confirm against that file.
global sym(vp9_filter_block1d16_v2_sse2) PRIVATE
sym(vp9_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;0
    movdqu      xmm1, [rsi + rax]           ;1
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0                       ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
; vp9_filter_block1d4_v2_avg_sse2: vertical 2-tap filter, 4 pixels wide,
; averaging variant.
; Arguments (by index): 0 = src, 1 = src stride, 2 = dst, 3 = dst stride,
; 4 = row count, 5 = filter table.
; Each filtered row (source row at [rsi] combined with the row at
; [rsi + rax]) is averaged with the bytes already in the destination.
global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;load src
    movd        xmm1, [rsi + rax]           ;row below

    APPLY_FILTER_4 1                        ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
; vp9_filter_block1d8_v2_avg_sse2: vertical 2-tap filter, 8 pixels wide,
; averaging variant.
; Arguments (by index): 0 = src, 1 = src stride, 2 = dst, 3 = dst stride,
; 4 = row count, 5 = filter table.
; Each filtered row (source row at [rsi] combined with the row at
; [rsi + rax]) is averaged with the bytes already in the destination.
; NOTE(review): SAVE_XMM 7 / RESTORE_XMM come from x86_abi_support.asm;
; presumably they preserve xmm6/xmm7 (used by GET_PARAM) where the ABI
; treats them as callee-saved -- confirm against that file.
global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;0
    movq        xmm1, [rsi + rax]           ;1

    APPLY_FILTER_8 1                        ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
; vp9_filter_block1d16_v2_avg_sse2: vertical 2-tap filter, 16 pixels wide,
; averaging variant.
; Arguments (by index): 0 = src, 1 = src stride, 2 = dst, 3 = dst stride,
; 4 = row count, 5 = filter table.
; Each filtered row (source row at [rsi] combined with the row at
; [rsi + rax]) is averaged with the bytes already in the destination.
; NOTE(review): SAVE_XMM 7 / RESTORE_XMM come from x86_abi_support.asm;
; presumably they preserve xmm6/xmm7 (used by GET_PARAM) where the ABI
; treats them as callee-saved -- confirm against that file.
global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;0
    movdqu      xmm1, [rsi + rax]           ;1
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 1                       ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
; vp9_filter_block1d4_h2_sse2: horizontal 2-tap filter, 4 pixels wide.
; Arguments (by index): 0 = src, 1 = src stride, 2 = dst, 3 = dst stride,
; 4 = row count, 5 = filter table.
; Each output pixel combines source pixel x with pixel x + 1 of the same
; row; the result overwrites the destination.
; NOTE: the row is fetched with a 16-byte unaligned load although only
; the first 5 source bytes contribute to the output.
global sym(vp9_filter_block1d4_h2_sse2) PRIVATE
sym(vp9_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;same row, one pixel to the right

    APPLY_FILTER_4 0                        ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
; vp9_filter_block1d8_h2_sse2: horizontal 2-tap filter, 8 pixels wide.
; Arguments (by index): 0 = src, 1 = src stride, 2 = dst, 3 = dst stride,
; 4 = row count, 5 = filter table.
; Each output pixel combines source pixel x with pixel x + 1 of the same
; row; the result overwrites the destination.
; NOTE: the row is fetched with a 16-byte unaligned load although only
; the first 9 source bytes contribute to the output.
; NOTE(review): SAVE_XMM 7 / RESTORE_XMM come from x86_abi_support.asm;
; presumably they preserve xmm6/xmm7 (used by GET_PARAM) where the ABI
; treats them as callee-saved -- confirm against that file.
global sym(vp9_filter_block1d8_h2_sse2) PRIVATE
sym(vp9_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;same row, one pixel to the right

    APPLY_FILTER_8 0                        ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
; vp9_filter_block1d16_h2_sse2: horizontal 2-tap filter, 16 pixels wide.
; Arguments (by index): 0 = src, 1 = src stride, 2 = dst, 3 = dst stride,
; 4 = row count, 5 = filter table.
; Each output pixel combines source pixel x with pixel x + 1 of the same
; row, fetched with two unaligned loads at [rsi] and [rsi + 1]; the
; result overwrites the destination.
; NOTE(review): SAVE_XMM 7 / RESTORE_XMM come from x86_abi_support.asm;
; presumably they preserve xmm6/xmm7 (used by GET_PARAM) where the ABI
; treats them as callee-saved -- confirm against that file.
global sym(vp9_filter_block1d16_h2_sse2) PRIVATE
sym(vp9_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqu      xmm1, [rsi + 1]             ;same row, one pixel to the right
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0                       ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
; vp9_filter_block1d4_h2_avg_sse2: horizontal 2-tap filter, 4 pixels wide,
; averaging variant.
; Arguments (by index): 0 = src, 1 = src stride, 2 = dst, 3 = dst stride,
; 4 = row count, 5 = filter table.
; Each filtered pixel (source pixel x combined with pixel x + 1) is
; averaged with the byte already in the destination.
; NOTE: the row is fetched with a 16-byte unaligned load although only
; the first 5 source bytes contribute to the output.
global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;same row, one pixel to the right

    APPLY_FILTER_4 1                        ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
; vp9_filter_block1d8_h2_avg_sse2: horizontal 2-tap filter, 8 pixels wide,
; averaging variant.
; Arguments (by index): 0 = src, 1 = src stride, 2 = dst, 3 = dst stride,
; 4 = row count, 5 = filter table.
; Each filtered pixel (source pixel x combined with pixel x + 1) is
; averaged with the byte already in the destination.
; NOTE: the row is fetched with a 16-byte unaligned load although only
; the first 9 source bytes contribute to the output.
; NOTE(review): SAVE_XMM 7 / RESTORE_XMM come from x86_abi_support.asm;
; presumably they preserve xmm6/xmm7 (used by GET_PARAM) where the ABI
; treats them as callee-saved -- confirm against that file.
global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;same row, one pixel to the right

    APPLY_FILTER_8 1                        ;ends with dec rcx
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret