Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
d3e9409b
Commit
d3e9409b
authored
Jan 21, 2011
by
Yunqing Wang
Committed by
Code Review
Jan 21, 2011
Browse files
Merge "Modify sub-pixel filters to eliminate unnecessary calculations"
parents
0cdfef1e
0822a62f
Changes
2
Hide whitespace changes
Inline
Side-by-side
vp8/encoder/x86/variance_impl_sse2.asm
View file @
d3e9409b
...
...
@@ -493,8 +493,8 @@ sym(vp8_get8x8var_sse2):
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
;
unsigned short *HFilter
,
;
unsigned short *VFilter
,
;
int xoffset
,
;
int yoffset
,
; int *sum,
; unsigned int *sumsquared;;
;
...
...
@@ -504,68 +504,80 @@ sym(vp8_filter_block2d_bil_var_sse2):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
9
SAVE_XMM
GET_GOT
rbx
push
rsi
push
rdi
sub
rsp
,
16
push
rbx
; end prolog
pxor
xmm6
,
xmm6
;
pxor
xmm7
,
xmm7
;
mov
rax
,
arg
(
5
)
;HFilter ;
mov
r
dx
,
arg
(
6
)
;VFilter ;
mov
rsi
,
arg
(
0
)
;ref_ptr ;
lea
r
si
,
[
GLOBAL
(
xmm_bi_rd
)]
; rounding
mov
dqa
xmm4
,
XMMWORD
PTR
[
rsi
]
mov
rdi
,
arg
(
2
)
;src_ptr ;
movsxd
rcx
,
dword
ptr
arg
(
4
)
;Height ;
lea
rcx
,
[
GLOBAL
(
vp8_bilinear_filters_sse2
)]
movsxd
rax
,
dword
ptr
arg
(
5
)
; xoffset
cmp
rax
,
0
; skip first_pass filter if xoffset=0
je
filter_block2d_bil_var_sse2_sp_only
shl
rax
,
5
; point to filter coeff with xoffset
lea
rax
,
[
rax
+
rcx
]
; HFilter
movsxd
rdx
,
dword
ptr
arg
(
6
)
; yoffset
cmp
rdx
,
0
; skip second_pass filter if yoffset=0
je
filter_block2d_bil_var_sse2_fp_only
shl
rdx
,
5
lea
rdx
,
[
rdx
+
rcx
]
; VFilter
mov
rsi
,
arg
(
0
)
;ref_ptr
mov
rdi
,
arg
(
2
)
;src_ptr
movsxd
rcx
,
dword
ptr
arg
(
4
)
;Height
pxor
xmm0
,
xmm0
;
movq
xmm1
,
QWORD
PTR
[
rsi
]
;
movq
xmm1
,
QWORD
PTR
[
rsi
]
;
movq
xmm3
,
QWORD
PTR
[
rsi
+
1
]
;
movq
xmm3
,
QWORD
PTR
[
rsi
+
1
]
;
punpcklbw
xmm1
,
xmm0
;
pmullw
xmm1
,
[
rax
]
;
pmullw
xmm1
,
[
rax
]
;
punpcklbw
xmm3
,
xmm0
;
pmullw
xmm3
,
[
rax
+
16
]
;
paddw
xmm1
,
xmm3
;
paddw
xmm1
,
[
GLOBAL
(
xmm_bi_rd
)]
;
psraw
xmm1
,
xmm_filter_shift
;
paddw
xmm1
,
xmm3
;
paddw
xmm1
,
xmm4
;
psraw
xmm1
,
xmm_filter_shift
;
movdqa
xmm5
,
xmm1
%if ABI_IS_32BIT
ad
d
rsi
,
dword
ptr
arg
(
1
)
;ref_pixels_per_line
;
%else
movsxd
r8
,
dword
ptr
arg
(
1
)
;ref_pixels_per_line ;
ad
d
rsi
,
r8
movsx
d
rbx
,
dword
ptr
arg
(
1
)
;ref_pixels_per_line
lea
rsi
,
[
rsi
+
rbx
]
%if ABI_IS_32BIT=0
movsx
d
r9
,
dword
ptr
arg
(
3
)
;src_pixels_per_line
%endif
filter_block2d_bil_var_sse2_loop:
filter_block2d_bil_var_sse2_loop:
movq
xmm1
,
QWORD
PTR
[
rsi
]
;
movq
xmm3
,
QWORD
PTR
[
rsi
+
1
]
;
punpcklbw
xmm1
,
xmm0
;
pmullw
xmm1
,
[
rax
]
;
punpcklbw
xmm3
,
xmm0
;
pmullw
xmm3
,
[
rax
+
16
]
;
paddw
xmm1
,
xmm3
;
paddw
xmm1
,
[
GLOBAL
(
xmm_bi_rd
)]
;
paddw
xmm1
,
xmm4
;
psraw
xmm1
,
xmm_filter_shift
;
movdqa
xmm3
,
xmm5
;
movdqa
xmm3
,
xmm5
;
movdqa
xmm5
,
xmm1
;
pmullw
xmm3
,
[
rdx
]
;
pmullw
xmm3
,
[
rdx
]
;
pmullw
xmm1
,
[
rdx
+
16
]
;
paddw
xmm1
,
xmm3
;
paddw
xmm1
,
[
GLOBAL
(
xmm_bi_rd
)]
;
paddw
xmm1
,
xmm4
;
psraw
xmm1
,
xmm_filter_shift
;
movq
xmm3
,
QWORD
PTR
[
rdi
]
;
...
...
@@ -577,20 +589,103 @@ filter_block2d_bil_var_sse2_loop:
pmaddwd
xmm1
,
xmm1
;
paddd
xmm7
,
xmm1
;
lea
rsi
,
[
rsi
+
rbx
]
;ref_pixels_per_line
%if ABI_IS_32BIT
add
rsi
,
dword
ptr
arg
(
1
)
;ref_pixels_per_line ;
add
rdi
,
dword
ptr
arg
(
3
)
;src_pixels_per_line ;
add
rdi
,
dword
ptr
arg
(
3
)
;src_pixels_per_line
%else
movsxd
r8
,
dword
ptr
arg
(
1
)
;ref_pixels_per_line ;
movsxd
r9
,
dword
ptr
arg
(
3
)
;src_pixels_per_line ;
add
rsi
,
r8
add
rdi
,
r9
lea
rdi
,
[
rdi
+
r9
]
%endif
sub
rcx
,
1
;
jnz
filter_block2d_bil_var_sse2_loop
;
jmp
filter_block2d_bil_variance
filter_block2d_bil_var_sse2_sp_only:
movsxd
rdx
,
dword
ptr
arg
(
6
)
; yoffset
shl
rdx
,
5
lea
rdx
,
[
rdx
+
rcx
]
; VFilter
mov
rsi
,
arg
(
0
)
;ref_ptr
mov
rdi
,
arg
(
2
)
;src_ptr
movsxd
rcx
,
dword
ptr
arg
(
4
)
;Height
movsxd
rax
,
dword
ptr
arg
(
1
)
;ref_pixels_per_line
pxor
xmm0
,
xmm0
;
movq
xmm1
,
QWORD
PTR
[
rsi
]
;
punpcklbw
xmm1
,
xmm0
;
movsxd
rbx
,
dword
ptr
arg
(
3
)
;src_pixels_per_line
lea
rsi
,
[
rsi
+
rax
]
filter_block2d_bil_sp_only_loop:
movq
xmm3
,
QWORD
PTR
[
rsi
]
;
punpcklbw
xmm3
,
xmm0
;
movdqa
xmm5
,
xmm3
pmullw
xmm1
,
[
rdx
]
;
pmullw
xmm3
,
[
rdx
+
16
]
;
paddw
xmm1
,
xmm3
;
paddw
xmm1
,
xmm4
;
psraw
xmm1
,
xmm_filter_shift
;
movq
xmm3
,
QWORD
PTR
[
rdi
]
;
punpcklbw
xmm3
,
xmm0
;
psubw
xmm1
,
xmm3
;
paddw
xmm6
,
xmm1
;
pmaddwd
xmm1
,
xmm1
;
paddd
xmm7
,
xmm1
;
movdqa
xmm1
,
xmm5
;
lea
rsi
,
[
rsi
+
rax
]
;ref_pixels_per_line
lea
rdi
,
[
rdi
+
rbx
]
;src_pixels_per_line
sub
rcx
,
1
;
jnz
filter_block2d_bil_sp_only_loop
;
jmp
filter_block2d_bil_variance
filter_block2d_bil_var_sse2_fp_only:
mov
rsi
,
arg
(
0
)
;ref_ptr
mov
rdi
,
arg
(
2
)
;src_ptr
movsxd
rcx
,
dword
ptr
arg
(
4
)
;Height
movsxd
rdx
,
dword
ptr
arg
(
1
)
;ref_pixels_per_line
pxor
xmm0
,
xmm0
;
movsxd
rbx
,
dword
ptr
arg
(
3
)
;src_pixels_per_line
filter_block2d_bil_fp_only_loop:
movq
xmm1
,
QWORD
PTR
[
rsi
]
;
movq
xmm3
,
QWORD
PTR
[
rsi
+
1
]
;
punpcklbw
xmm1
,
xmm0
;
pmullw
xmm1
,
[
rax
]
;
punpcklbw
xmm3
,
xmm0
;
pmullw
xmm3
,
[
rax
+
16
]
;
paddw
xmm1
,
xmm3
;
paddw
xmm1
,
xmm4
;
psraw
xmm1
,
xmm_filter_shift
;
movq
xmm3
,
QWORD
PTR
[
rdi
]
;
punpcklbw
xmm3
,
xmm0
;
psubw
xmm1
,
xmm3
;
paddw
xmm6
,
xmm1
;
pmaddwd
xmm1
,
xmm1
;
paddd
xmm7
,
xmm1
;
lea
rsi
,
[
rsi
+
rdx
]
lea
rdi
,
[
rdi
+
rbx
]
;src_pixels_per_line
sub
rcx
,
1
;
jnz
filter_block2d_bil_fp_only_loop
;
jmp
filter_block2d_bil_variance
filter_block2d_bil_variance:
movdq2q
mm6
,
xmm6
;
movdq2q
mm7
,
xmm7
;
...
...
@@ -627,12 +722,12 @@ filter_block2d_bil_var_sse2_loop:
movd
[
rsi
],
mm2
; xsum
movd
[
rdi
],
mm4
; xxsum
; begin epilog
add
rsp
,
16
pop
rbx
pop
rdi
pop
rsi
REST
ORE_GOT
REST
ORE_XMM
UNSHADOW_ARGS
pop
rbp
ret
...
...
@@ -974,3 +1069,13 @@ SECTION_RODATA
align
16
xmm_bi_rd:
times
8
dw
64
align
16
vp8_bilinear_filters_sse2:
dw
128
,
128
,
128
,
128
,
128
,
128
,
128
,
128
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
dw
112
,
112
,
112
,
112
,
112
,
112
,
112
,
112
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
dw
96
,
96
,
96
,
96
,
96
,
96
,
96
,
96
,
32
,
32
,
32
,
32
,
32
,
32
,
32
,
32
dw
80
,
80
,
80
,
80
,
80
,
80
,
80
,
80
,
48
,
48
,
48
,
48
,
48
,
48
,
48
,
48
dw
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
dw
48
,
48
,
48
,
48
,
48
,
48
,
48
,
48
,
80
,
80
,
80
,
80
,
80
,
80
,
80
,
80
dw
32
,
32
,
32
,
32
,
32
,
32
,
32
,
32
,
96
,
96
,
96
,
96
,
96
,
96
,
96
,
96
dw
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
112
,
112
,
112
,
112
,
112
,
112
,
112
,
112
vp8/encoder/x86/variance_sse2.c
View file @
d3e9409b
...
...
@@ -76,8 +76,8 @@ void vp8_filter_block2d_bil_var_sse2
const
unsigned
char
*
src_ptr
,
int
src_pixels_per_line
,
unsigned
int
Height
,
const
short
*
HFilter
,
const
short
*
VFilter
,
int
xoffset
,
int
yoffset
,
int
*
sum
,
unsigned
int
*
sumsquared
);
...
...
@@ -222,21 +222,6 @@ unsigned int vp8_variance8x16_wmt
}
///////////////////////////////////////////////////////////////////////////
// the mmx function that does the bilinear filtering and var calculation //
// int one pass //
///////////////////////////////////////////////////////////////////////////
DECLARE_ALIGNED
(
16
,
const
short
,
vp8_bilinear_filters_xmm
[
8
][
16
])
=
{
{
128
,
128
,
128
,
128
,
128
,
128
,
128
,
128
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
},
{
112
,
112
,
112
,
112
,
112
,
112
,
112
,
112
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
},
{
96
,
96
,
96
,
96
,
96
,
96
,
96
,
96
,
32
,
32
,
32
,
32
,
32
,
32
,
32
,
32
},
{
80
,
80
,
80
,
80
,
80
,
80
,
80
,
80
,
48
,
48
,
48
,
48
,
48
,
48
,
48
,
48
},
{
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
,
64
},
{
48
,
48
,
48
,
48
,
48
,
48
,
48
,
48
,
80
,
80
,
80
,
80
,
80
,
80
,
80
,
80
},
{
32
,
32
,
32
,
32
,
32
,
32
,
32
,
32
,
96
,
96
,
96
,
96
,
96
,
96
,
96
,
96
},
{
16
,
16
,
16
,
16
,
16
,
16
,
16
,
16
,
112
,
112
,
112
,
112
,
112
,
112
,
112
,
112
}
};
unsigned
int
vp8_sub_pixel_variance4x4_wmt
(
const
unsigned
char
*
src_ptr
,
...
...
@@ -272,15 +257,38 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
unsigned
int
*
sse
)
{
int
xsum
;
unsigned
int
xxsum
;
vp8_filter_block2d_bil_var_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
8
,
vp8_bilinear_filters_xmm
[
xoffset
],
vp8_bilinear_filters_xmm
[
yoffset
],
&
xsum
,
&
xxsum
);
if
(
xoffset
==
4
&&
yoffset
==
0
)
{
vp8_half_horiz_variance16x_h_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
8
,
&
xsum
,
&
xxsum
);
}
else
if
(
xoffset
==
0
&&
yoffset
==
4
)
{
vp8_half_vert_variance16x_h_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
8
,
&
xsum
,
&
xxsum
);
}
else
if
(
xoffset
==
4
&&
yoffset
==
4
)
{
vp8_half_horiz_vert_variance16x_h_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
8
,
&
xsum
,
&
xxsum
);
}
else
{
vp8_filter_block2d_bil_var_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
8
,
xoffset
,
yoffset
,
&
xsum
,
&
xxsum
);
}
*
sse
=
xxsum
;
return
(
xxsum
-
((
xsum
*
xsum
)
>>
6
));
...
...
@@ -344,7 +352,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
vp8_filter_block2d_bil_var_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
16
,
vp8_bilinear_filters_xmm
[
xoffset
]
,
vp8_bilinear_filters_xmm
[
yoffset
]
,
xoffset
,
yoffset
,
&
xsum0
,
&
xxsum0
);
...
...
@@ -352,7 +360,7 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
vp8_filter_block2d_bil_var_sse2
(
src_ptr
+
8
,
src_pixels_per_line
,
dst_ptr
+
8
,
dst_pixels_per_line
,
16
,
vp8_bilinear_filters_xmm
[
xoffset
]
,
vp8_bilinear_filters_xmm
[
yoffset
]
,
xoffset
,
yoffset
,
&
xsum1
,
&
xxsum1
);
}
...
...
@@ -392,21 +400,56 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
int
xsum0
,
xsum1
;
unsigned
int
xxsum0
,
xxsum1
;
if
(
xoffset
==
4
&&
yoffset
==
0
)
{
vp8_half_horiz_variance16x_h_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
8
,
&
xsum0
,
&
xxsum0
);
vp8_filter_block2d_bil_var_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
8
,
vp8_bilinear_filters_xmm
[
xoffset
],
vp8_bilinear_filters_xmm
[
yoffset
],
&
xsum0
,
&
xxsum0
);
vp8_half_horiz_variance16x_h_sse2
(
src_ptr
+
8
,
src_pixels_per_line
,
dst_ptr
+
8
,
dst_pixels_per_line
,
8
,
&
xsum1
,
&
xxsum1
);
}
else
if
(
xoffset
==
0
&&
yoffset
==
4
)
{
vp8_half_vert_variance16x_h_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
8
,
&
xsum0
,
&
xxsum0
);
vp8_half_vert_variance16x_h_sse2
(
src_ptr
+
8
,
src_pixels_per_line
,
dst_ptr
+
8
,
dst_pixels_per_line
,
8
,
&
xsum1
,
&
xxsum1
);
}
else
if
(
xoffset
==
4
&&
yoffset
==
4
)
{
vp8_half_horiz_vert_variance16x_h_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
8
,
&
xsum0
,
&
xxsum0
);
vp8_half_horiz_vert_variance16x_h_sse2
(
src_ptr
+
8
,
src_pixels_per_line
,
dst_ptr
+
8
,
dst_pixels_per_line
,
8
,
&
xsum1
,
&
xxsum1
);
}
else
{
vp8_filter_block2d_bil_var_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
8
,
xoffset
,
yoffset
,
&
xsum0
,
&
xxsum0
);
vp8_filter_block2d_bil_var_sse2
(
src_ptr
+
8
,
src_pixels_per_line
,
dst_ptr
+
8
,
dst_pixels_per_line
,
8
,
vp8_bilinear_filters_xmm
[
xoffset
]
,
vp8_bilinear_filters_xmm
[
yoffset
]
,
&
xsum1
,
&
xxsum1
);
vp8_filter_block2d_bil_var_sse2
(
src_ptr
+
8
,
src_pixels_per_line
,
dst_ptr
+
8
,
dst_pixels_per_line
,
8
,
xoffset
,
yoffset
,
&
xsum1
,
&
xxsum1
);
}
xsum0
+=
xsum1
;
xxsum0
+=
xxsum1
;
...
...
@@ -428,12 +471,36 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
{
int
xsum
;
unsigned
int
xxsum
;
vp8_filter_block2d_bil_var_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
16
,
vp8_bilinear_filters_xmm
[
xoffset
],
vp8_bilinear_filters_xmm
[
yoffset
],
&
xsum
,
&
xxsum
);
if
(
xoffset
==
4
&&
yoffset
==
0
)
{
vp8_half_horiz_variance16x_h_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
16
,
&
xsum
,
&
xxsum
);
}
else
if
(
xoffset
==
0
&&
yoffset
==
4
)
{
vp8_half_vert_variance16x_h_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
16
,
&
xsum
,
&
xxsum
);
}
else
if
(
xoffset
==
4
&&
yoffset
==
4
)
{
vp8_half_horiz_vert_variance16x_h_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
16
,
&
xsum
,
&
xxsum
);
}
else
{
vp8_filter_block2d_bil_var_sse2
(
src_ptr
,
src_pixels_per_line
,
dst_ptr
,
dst_pixels_per_line
,
16
,
xoffset
,
yoffset
,
&
xsum
,
&
xxsum
);
}
*
sse
=
xxsum
;
return
(
xxsum
-
((
xsum
*
xsum
)
>>
7
));
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment