Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Guillaume Martres
aom-rav1e
Commits
3a0b59e3
Commit
3a0b59e3
authored
Oct 11, 2013
by
Yunqing Wang
Committed by
Gerrit Code Review
Oct 11, 2013
Browse files
Merge "SSE2 8-tap sub-pixel filter optimization"
parents
899ab95c
3fb728c7
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
test/convolve_test.cc
View file @
3a0b59e3
...
...
@@ -599,6 +599,28 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
make_tuple
(
32
,
64
,
&
convolve8_c
),
make_tuple
(
64
,
64
,
&
convolve8_c
)));
#if HAVE_SSE2
// Function table bundling the six SSE2 convolve entry points, passed in the
// order: horiz, avg_horiz, vert, avg_vert, 2-D, avg 2-D.
const
ConvolveFunctions
convolve8_sse2
(
vp9_convolve8_horiz_sse2
,
vp9_convolve8_avg_horiz_sse2
,
vp9_convolve8_vert_sse2
,
vp9_convolve8_avg_vert_sse2
,
vp9_convolve8_sse2
,
vp9_convolve8_avg_sse2
);
// Instantiates ConvolveTest under the "SSE2" prefix once per tuple below.
// Each tuple is (size, size, function table); the two integers are
// presumably block width and height -- confirm against ConvolveTest.
INSTANTIATE_TEST_CASE_P
(
SSE2
,
ConvolveTest
,
::
testing
::
Values
(
make_tuple
(
4
,
4
,
&
convolve8_sse2
),
make_tuple
(
8
,
4
,
&
convolve8_sse2
),
make_tuple
(
4
,
8
,
&
convolve8_sse2
),
make_tuple
(
8
,
8
,
&
convolve8_sse2
),
make_tuple
(
16
,
8
,
&
convolve8_sse2
),
make_tuple
(
8
,
16
,
&
convolve8_sse2
),
make_tuple
(
16
,
16
,
&
convolve8_sse2
),
make_tuple
(
32
,
16
,
&
convolve8_sse2
),
make_tuple
(
16
,
32
,
&
convolve8_sse2
),
make_tuple
(
32
,
32
,
&
convolve8_sse2
),
make_tuple
(
64
,
32
,
&
convolve8_sse2
),
make_tuple
(
32
,
64
,
&
convolve8_sse2
),
make_tuple
(
64
,
64
,
&
convolve8_sse2
)));
#endif
#if HAVE_SSSE3
const
ConvolveFunctions
convolve8_ssse3
(
vp9_convolve8_horiz_ssse3
,
vp9_convolve8_avg_horiz_ssse3
,
...
...
vp9/common/vp9_rtcd_defs.sh
View file @
3a0b59e3
...
...
@@ -247,22 +247,22 @@ prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8
specialize vp9_convolve_avg
$sse2_x86inc
neon dspr2
prototype void vp9_convolve8
"const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8 ssse3 neon dspr2
specialize vp9_convolve8
sse2
ssse3 neon dspr2
prototype void vp9_convolve8_horiz
"const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_horiz ssse3 neon dspr2
specialize vp9_convolve8_horiz
sse2
ssse3 neon dspr2
prototype void vp9_convolve8_vert
"const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_vert ssse3 neon dspr2
specialize vp9_convolve8_vert
sse2
ssse3 neon dspr2
prototype void vp9_convolve8_avg
"const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg ssse3 neon dspr2
specialize vp9_convolve8_avg
sse2
ssse3 neon dspr2
prototype void vp9_convolve8_avg_horiz
"const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_horiz ssse3 neon dspr2
specialize vp9_convolve8_avg_horiz
sse2
ssse3 neon dspr2
prototype void vp9_convolve8_avg_vert
"const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_vert ssse3 neon dspr2
specialize vp9_convolve8_avg_vert
sse2
ssse3 neon dspr2
#
# dct
...
...
vp9/common/x86/vp9_asm_stubs.c
View file @
3a0b59e3
...
...
@@ -36,90 +36,28 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
{
8
,
8
,
8
,
8
,
120
,
120
,
120
,
120
}
};
#if HAVE_SSSE3
void
vp9_filter_block1d16_v8_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
void
vp9_filter_block1d16_h8_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
void
vp9_filter_block1d8_v8_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
void
vp9_filter_block1d8_h8_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
void
vp9_filter_block1d4_v8_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
void
vp9_filter_block1d4_h8_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
typedef
void
filter8_1dfunction
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
void
vp9_filter_block1d16_v8_avg_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
void
vp9_filter_block1d16_h8_avg_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
void
vp9_filter_block1d8_v8_avg_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
void
vp9_filter_block1d8_h8_avg_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
void
vp9_filter_block1d4_v8_avg_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
void
vp9_filter_block1d4_h8_avg_ssse3
(
const
unsigned
char
*
src_ptr
,
const
unsigned
int
src_pitch
,
unsigned
char
*
output_ptr
,
unsigned
int
out_pitch
,
unsigned
int
output_height
,
const
short
*
filter
);
#if HAVE_SSSE3
filter8_1dfunction
vp9_filter_block1d16_v8_ssse3
;
filter8_1dfunction
vp9_filter_block1d16_h8_ssse3
;
filter8_1dfunction
vp9_filter_block1d8_v8_ssse3
;
filter8_1dfunction
vp9_filter_block1d8_h8_ssse3
;
filter8_1dfunction
vp9_filter_block1d4_v8_ssse3
;
filter8_1dfunction
vp9_filter_block1d4_h8_ssse3
;
filter8_1dfunction
vp9_filter_block1d16_v8_avg_ssse3
;
filter8_1dfunction
vp9_filter_block1d16_h8_avg_ssse3
;
filter8_1dfunction
vp9_filter_block1d8_v8_avg_ssse3
;
filter8_1dfunction
vp9_filter_block1d8_h8_avg_ssse3
;
filter8_1dfunction
vp9_filter_block1d4_v8_avg_ssse3
;
filter8_1dfunction
vp9_filter_block1d4_h8_avg_ssse3
;
void
vp9_convolve8_horiz_ssse3
(
const
uint8_t
*
src
,
ptrdiff_t
src_stride
,
uint8_t
*
dst
,
ptrdiff_t
dst_stride
,
...
...
@@ -317,3 +255,214 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
}
}
#endif
#if HAVE_SSE2
/*
 * Forward declarations of the SSE2 1-D filter kernels. Each one is declared
 * with the filter8_1dfunction function type, so all twelve share a single
 * signature. The number after "block1d" (16, 8, 4) matches the strip width,
 * in columns, that the vp9_convolve8_*_sse2 wrappers advance by per call.
 * NOTE(review): h8/v8 presumably denote horizontal/vertical 8-tap filters and
 * the definitions presumably live in vp9_subpixel_8t_sse2.asm -- confirm.
 */
filter8_1dfunction
vp9_filter_block1d16_v8_sse2
;
filter8_1dfunction
vp9_filter_block1d16_h8_sse2
;
filter8_1dfunction
vp9_filter_block1d8_v8_sse2
;
filter8_1dfunction
vp9_filter_block1d8_h8_sse2
;
filter8_1dfunction
vp9_filter_block1d4_v8_sse2
;
filter8_1dfunction
vp9_filter_block1d4_h8_sse2
;
/* "avg" variants of the same six kernels. */
filter8_1dfunction
vp9_filter_block1d16_v8_avg_sse2
;
filter8_1dfunction
vp9_filter_block1d16_h8_avg_sse2
;
filter8_1dfunction
vp9_filter_block1d8_v8_avg_sse2
;
filter8_1dfunction
vp9_filter_block1d8_h8_avg_sse2
;
filter8_1dfunction
vp9_filter_block1d4_v8_avg_sse2
;
filter8_1dfunction
vp9_filter_block1d4_h8_avg_sse2
;
/*
 * Horizontal 8-tap convolution, SSE2 entry point.
 *
 * When x_step_q4 is 16 and filter_x[3] is not 128, the block is consumed in
 * strips 16, then 8, then 4 columns wide by the
 * vp9_filter_block1d{16,8,4}_h8_sse2 kernels, each invoked over all h rows.
 * Whatever width is left afterwards (or the entire block when the condition
 * does not hold) is delegated to vp9_convolve8_horiz_c.
 *
 * filter_y and y_step_q4 are only forwarded to the C fallback.
 */
void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  /* Ensure the filter can be compressed to int16_t. */
  const int use_simd = (x_step_q4 == 16) && (filter_x[3] != 128);

  if (use_simd) {
    /* Widest strips first; each loop leaves fewer columns than its width. */
    for (; w >= 16; src += 16, dst += 16, w -= 16) {
      vp9_filter_block1d16_h8_sse2(src, src_stride, dst, dst_stride, h,
                                   filter_x);
    }
    for (; w >= 8; src += 8, dst += 8, w -= 8) {
      vp9_filter_block1d8_h8_sse2(src, src_stride, dst, dst_stride, h,
                                  filter_x);
    }
    for (; w >= 4; src += 4, dst += 4, w -= 4) {
      vp9_filter_block1d4_h8_sse2(src, src_stride, dst, dst_stride, h,
                                  filter_x);
    }
  }

  /* Remaining columns (all of them if the SIMD path was skipped). */
  if (w != 0) {
    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  }
}
/*
 * Vertical 8-tap convolution, SSE2 entry point.
 *
 * When y_step_q4 is 16 and filter_y[3] is not 128, the block is consumed in
 * strips 16, then 8, then 4 columns wide by the
 * vp9_filter_block1d{16,8,4}_v8_sse2 kernels. Each kernel is handed a source
 * pointer three rows above src and runs over all h rows. Whatever width is
 * left afterwards (or the entire block when the condition does not hold) is
 * delegated to vp9_convolve8_vert_c.
 *
 * filter_x and x_step_q4 are only forwarded to the C fallback.
 */
void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  const int use_simd = (y_step_q4 == 16) && (filter_y[3] != 128);

  if (use_simd) {
    /* Widest strips first; each loop leaves fewer columns than its width. */
    for (; w >= 16; src += 16, dst += 16, w -= 16) {
      vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
                                   dst, dst_stride, h, filter_y);
    }
    for (; w >= 8; src += 8, dst += 8, w -= 8) {
      vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
                                  dst, dst_stride, h, filter_y);
    }
    for (; w >= 4; src += 4, dst += 4, w -= 4) {
      vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
                                  dst, dst_stride, h, filter_y);
    }
  }

  /* Remaining columns (all of them if the SIMD path was skipped). */
  if (w != 0) {
    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  }
}
/*
 * Horizontal 8-tap "avg" convolution, SSE2 entry point.
 *
 * When x_step_q4 is 16 and filter_x[3] is not 128, the block is consumed in
 * strips 16, then 8, then 4 columns wide by the
 * vp9_filter_block1d{16,8,4}_h8_avg_sse2 kernels, each invoked over all h
 * rows. Whatever width is left afterwards (or the entire block when the
 * condition does not hold) is delegated to vp9_convolve8_avg_horiz_c.
 *
 * filter_y and y_step_q4 are only forwarded to the C fallback.
 * NOTE(review): "avg" presumably means the result is averaged with the
 * existing contents of dst -- confirm in the kernels.
 */
void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  const int use_simd = (x_step_q4 == 16) && (filter_x[3] != 128);

  if (use_simd) {
    /* Widest strips first; each loop leaves fewer columns than its width. */
    for (; w >= 16; src += 16, dst += 16, w -= 16) {
      vp9_filter_block1d16_h8_avg_sse2(src, src_stride, dst, dst_stride, h,
                                       filter_x);
    }
    for (; w >= 8; src += 8, dst += 8, w -= 8) {
      vp9_filter_block1d8_h8_avg_sse2(src, src_stride, dst, dst_stride, h,
                                      filter_x);
    }
    for (; w >= 4; src += 4, dst += 4, w -= 4) {
      vp9_filter_block1d4_h8_avg_sse2(src, src_stride, dst, dst_stride, h,
                                      filter_x);
    }
  }

  /* Remaining columns (all of them if the SIMD path was skipped). */
  if (w != 0) {
    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  }
}
/*
 * Vertical 8-tap "avg" convolution, SSE2 entry point.
 *
 * When y_step_q4 is 16 and filter_y[3] is not 128, the block is consumed in
 * strips 16, then 8, then 4 columns wide by the
 * vp9_filter_block1d{16,8,4}_v8_avg_sse2 kernels. Each kernel is handed a
 * source pointer three rows above src and runs over all h rows. Whatever
 * width is left afterwards (or the entire block when the condition does not
 * hold) is delegated to vp9_convolve8_avg_vert_c.
 *
 * filter_x and x_step_q4 are only forwarded to the C fallback.
 * NOTE(review): "avg" presumably means the result is averaged with the
 * existing contents of dst -- confirm in the kernels.
 */
void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h) {
  const int use_simd = (y_step_q4 == 16) && (filter_y[3] != 128);

  if (use_simd) {
    /* Widest strips first; each loop leaves fewer columns than its width. */
    for (; w >= 16; src += 16, dst += 16, w -= 16) {
      vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
                                       dst, dst_stride, h, filter_y);
    }
    for (; w >= 8; src += 8, dst += 8, w -= 8) {
      vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
                                      dst, dst_stride, h, filter_y);
    }
    for (; w >= 4; src += 4, dst += 4, w -= 4) {
      vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
                                      dst, dst_stride, h, filter_y);
    }
  }

  /* Remaining columns (all of them if the SIMD path was skipped). */
  if (w != 0) {
    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  }
}
/*
 * 2-D 8-tap convolution, SSE2 entry point.
 *
 * If either step is not 16 the whole call is delegated to vp9_convolve8_c.
 * Otherwise the work is done as two 1-D passes through the scratch buffer
 * fdata2 (64 * 71 bytes, used with a row stride of 64):
 *   1. vp9_convolve8_horiz_sse2 filters h + 7 rows, starting three rows
 *      above src, into fdata2.
 *   2. vp9_convolve8_vert_sse2 reads fdata2 starting at its fourth row
 *      (offset 3 * 64) and writes h rows to dst.
 *
 * w and h are asserted to be at most 64, which keeps h + 7 rows of up to 64
 * columns inside the scratch buffer.
 */
void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter_x, int x_step_q4,
                        const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);

  assert(w <= 64);
  assert(h <= 64);

  /* Any step other than 16 in either direction: use the C implementation. */
  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vp9_convolve8_c(src, src_stride, dst, dst_stride,
                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  /* Pass 1: horizontal filter into the scratch buffer. */
  vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
                           filter_x, x_step_q4, filter_y, y_step_q4,
                           w, h + 7);

  /* Pass 2: vertical filter from the scratch buffer into dst. */
  vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4,
                          w, h);
}
/*
 * 2-D 8-tap "avg" convolution, SSE2 entry point.
 *
 * If either step is not 16 the whole call is delegated to
 * vp9_convolve8_avg_c. Otherwise the work is done as two 1-D passes through
 * the scratch buffer fdata2 (64 * 71 bytes, used with a row stride of 64):
 *   1. vp9_convolve8_horiz_sse2 (the non-avg variant) filters h + 7 rows,
 *      starting three rows above src, into fdata2.
 *   2. vp9_convolve8_avg_vert_sse2 reads fdata2 starting at its fourth row
 *      (offset 3 * 64) and writes h rows to dst.
 *
 * w and h are asserted to be at most 64, which keeps h + 7 rows of up to 64
 * columns inside the scratch buffer.
 */
void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);

  assert(w <= 64);
  assert(h <= 64);

  /* Any step other than 16 in either direction: use the C implementation. */
  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  /* Pass 1: plain horizontal filter into the scratch buffer. */
  vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
                           filter_x, x_step_q4, filter_y, y_step_q4,
                           w, h + 7);

  /* Pass 2: "avg" vertical filter from the scratch buffer into dst. */
  vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
}
#endif
vp9/common/x86/vp9_subpixel_8t_sse2.asm
0 → 100644
View file @
3a0b59e3
This diff is collapsed.
Click to expand it.
vp9/vp9_common.mk
View file @
3a0b59e3
...
...
@@ -77,6 +77,7 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_ss
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC)
+=
common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC)
+=
common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX)
+=
common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2)
+=
common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3)
+=
common/x86/vp9_subpixel_8t_ssse3.asm
ifeq
($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX)
+=
common/x86/vp9_postproc_mmx.asm
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment