Commit 3fb728c7 authored by Yunqing Wang's avatar Yunqing Wang
Browse files

SSE2 8-tap sub-pixel filter optimization

To ensure fast encoding/decoding on devices without SSSE3 support,
the sub-pixel filters were optimized with SSE2. A test using a 1080p
clip showed decoder speeds of ~70fps with the SSSE3 filters, ~60fps
with the SSE2 filters, and ~15fps with the C filters.

Change-Id: Ie2088f87d83a889fba80a613e4d0e287aadd785c
parent 9603989c
......@@ -599,6 +599,28 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
make_tuple(32, 64, &convolve8_c),
make_tuple(64, 64, &convolve8_c)));
#if HAVE_SSE2
// Bundle of the six SSE2 convolve entry points handed to ConvolveTest:
// horiz, avg_horiz, vert, avg_vert, 2-D and 2-D avg (in constructor order).
const ConvolveFunctions convolve8_sse2(
vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
vp9_convolve8_sse2, vp9_convolve8_avg_sse2);
// Instantiates ConvolveTest for the SSE2 function bundle, once per tuple.
// NOTE(review): the two integers are presumably (block width, block height);
// confirm against the ConvolveTest parameter accessors.
INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_sse2),
make_tuple(8, 4, &convolve8_sse2),
make_tuple(4, 8, &convolve8_sse2),
make_tuple(8, 8, &convolve8_sse2),
make_tuple(16, 8, &convolve8_sse2),
make_tuple(8, 16, &convolve8_sse2),
make_tuple(16, 16, &convolve8_sse2),
make_tuple(32, 16, &convolve8_sse2),
make_tuple(16, 32, &convolve8_sse2),
make_tuple(32, 32, &convolve8_sse2),
make_tuple(64, 32, &convolve8_sse2),
make_tuple(32, 64, &convolve8_sse2),
make_tuple(64, 64, &convolve8_sse2)));
#endif
#if HAVE_SSSE3
const ConvolveFunctions convolve8_ssse3(
vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
......
......@@ -247,22 +247,22 @@ prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8
specialize vp9_convolve_avg $sse2_x86inc neon dspr2
prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8 ssse3 neon dspr2
specialize vp9_convolve8 sse2 ssse3 neon dspr2
prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_horiz ssse3 neon dspr2
specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2
prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_vert ssse3 neon dspr2
specialize vp9_convolve8_vert sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg ssse3 neon dspr2
specialize vp9_convolve8_avg sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_horiz ssse3 neon dspr2
specialize vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_vert ssse3 neon dspr2
specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2
#
# dct
......
......@@ -36,90 +36,28 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
{ 8, 8, 8, 8, 120, 120, 120, 120 }
};
#if HAVE_SSSE3
void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
/*
 * Common function type of the 1-D 8-tap filter kernels, so each kernel can be
 * declared with a single `filter8_1dfunction name;` line.
 * NOTE(review): parameter meanings below are inferred from their names and
 * from how the convolve wrappers call the kernels -- confirm against the
 * assembly implementations.
 */
typedef void filter8_1dfunction (
const unsigned char *src_ptr,   /* first source pixel handed to the kernel */
const unsigned int src_pitch,   /* source stride */
unsigned char *output_ptr,      /* first destination pixel */
unsigned int out_pitch,         /* destination stride */
unsigned int output_height,     /* number of rows to produce */
const short *filter             /* filter taps */
);
void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
#if HAVE_SSSE3
/*
 * Declarations of the SSSE3 1-D filter kernels; all share the
 * filter8_1dfunction signature and are defined outside this file.
 * NOTE(review): by naming, d16/d8/d4 is the strip width, h8/v8 the filter
 * direction, and _avg the averaging variant -- confirm against the assembly.
 */
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
......@@ -317,3 +255,214 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
}
}
#endif
#if HAVE_SSE2
/*
 * Declarations of the SSE2 1-D filter kernels used by the convolve wrappers
 * below; all share the filter8_1dfunction signature and are defined outside
 * this file. The wrappers call the d16/d8/d4 kernels on 16-, 8- and
 * 4-column strips, the h8 kernels from the horizontal wrappers, the v8
 * kernels from the vertical wrappers, and the _avg kernels from the
 * *_avg_* wrappers.
 */
filter8_1dfunction vp9_filter_block1d16_v8_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_sse2;
filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
/*
 * Horizontal 8-tap convolution with SSE2 dispatch.
 *
 * When the horizontal step is exactly 16 and tap 3 of filter_x is not 128,
 * the block is processed left to right in 16-, then 8-, then 4-column strips
 * by the SSE2 kernels. Any columns not consumed that way (including the whole
 * block when the fast-path condition fails) are handed to
 * vp9_convolve8_horiz_c.
 */
void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  /* Per the original note: ensure the filter can be compressed to int16_t.
   * filter_x[3] is only read when the step check has already passed. */
  const int simd_ok = (x_step_q4 == 16) && (filter_x[3] != 128);

  if (simd_ok) {
    /* Widest strips first; each loop leaves fewer columns than its width. */
    for (; w >= 16; w -= 16, src += 16, dst += 16) {
      vp9_filter_block1d16_h8_sse2(src, src_stride, dst, dst_stride,
                                   h, filter_x);
    }
    for (; w >= 8; w -= 8, src += 8, dst += 8) {
      vp9_filter_block1d8_h8_sse2(src, src_stride, dst, dst_stride,
                                  h, filter_x);
    }
    for (; w >= 4; w -= 4, src += 4, dst += 4) {
      vp9_filter_block1d4_h8_sse2(src, src_stride, dst, dst_stride,
                                  h, filter_x);
    }
  }

  /* Remaining columns go through the C implementation. */
  if (w != 0) {
    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4,
                          w, h);
  }
}
/*
 * Vertical 8-tap convolution with SSE2 dispatch.
 *
 * When the vertical step is exactly 16 and tap 3 of filter_y is not 128,
 * the block is processed in 16-, then 8-, then 4-column strips by the SSE2
 * kernels. Each kernel is given a source pointer three rows above the
 * current position. Columns not consumed that way (including the whole block
 * when the fast-path condition fails) are handed to vp9_convolve8_vert_c
 * with the unshifted source pointer.
 */
void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* filter_y[3] is only read when the step check has already passed. */
  const int simd_ok = (y_step_q4 == 16) && (filter_y[3] != 128);

  if (simd_ok) {
    for (; w >= 16; w -= 16, src += 16, dst += 16) {
      vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
                                   dst, dst_stride, h, filter_y);
    }
    for (; w >= 8; w -= 8, src += 8, dst += 8) {
      vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
                                  dst, dst_stride, h, filter_y);
    }
    for (; w >= 4; w -= 4, src += 4, dst += 4) {
      vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
                                  dst, dst_stride, h, filter_y);
    }
  }

  /* Remaining columns go through the C implementation. */
  if (w != 0) {
    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4,
                         w, h);
  }
}
/*
 * Horizontal 8-tap "avg" convolution with SSE2 dispatch.
 *
 * Same strip-wise dispatch as the non-avg horizontal wrapper, but using the
 * *_h8_avg_sse2 kernels and falling back to vp9_convolve8_avg_horiz_c.
 * NOTE(review): the _avg kernels presumably blend the filtered result with
 * the existing contents of dst -- confirm against the assembly.
 */
void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  /* filter_x[3] is only read when the step check has already passed. */
  const int simd_ok = (x_step_q4 == 16) && (filter_x[3] != 128);

  if (simd_ok) {
    /* Widest strips first: 16, then 8, then 4 columns at a time. */
    for (; w >= 16; w -= 16, src += 16, dst += 16) {
      vp9_filter_block1d16_h8_avg_sse2(src, src_stride, dst, dst_stride,
                                       h, filter_x);
    }
    for (; w >= 8; w -= 8, src += 8, dst += 8) {
      vp9_filter_block1d8_h8_avg_sse2(src, src_stride, dst, dst_stride,
                                      h, filter_x);
    }
    for (; w >= 4; w -= 4, src += 4, dst += 4) {
      vp9_filter_block1d4_h8_avg_sse2(src, src_stride, dst, dst_stride,
                                      h, filter_x);
    }
  }

  /* Remaining columns go through the C implementation. */
  if (w != 0) {
    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
  }
}
/*
 * Vertical 8-tap "avg" convolution with SSE2 dispatch.
 *
 * Same strip-wise dispatch as the non-avg vertical wrapper (kernels receive
 * a source pointer three rows above the current position), but using the
 * *_v8_avg_sse2 kernels and falling back to vp9_convolve8_avg_vert_c.
 * NOTE(review): the _avg kernels presumably blend the filtered result with
 * the existing contents of dst -- confirm against the assembly.
 */
void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h) {
  /* filter_y[3] is only read when the step check has already passed. */
  const int simd_ok = (y_step_q4 == 16) && (filter_y[3] != 128);

  if (simd_ok) {
    for (; w >= 16; w -= 16, src += 16, dst += 16) {
      vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
                                       dst, dst_stride, h, filter_y);
    }
    for (; w >= 8; w -= 8, src += 8, dst += 8) {
      vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
                                      dst, dst_stride, h, filter_y);
    }
    for (; w >= 4; w -= 4, src += 4, dst += 4) {
      vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
                                      dst, dst_stride, h, filter_y);
    }
  }

  /* Remaining columns go through the C implementation. */
  if (w != 0) {
    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
  }
}
/*
 * 2-D 8-tap convolution with SSE2 dispatch.
 *
 * For unscaled filtering (both steps equal to 16) the work is split into two
 * passes through a 16-byte-aligned scratch buffer with a fixed stride of 64:
 *   1. horizontal pass over h + 7 rows, starting three rows above src;
 *   2. vertical pass reading the scratch buffer from its fourth row and
 *      writing h rows to dst.
 * The buffer holds 64 * 71 bytes, i.e. (64 + 7) rows of 64 pixels, which is
 * why w and h are asserted to be at most 64. Any other step falls back to
 * vp9_convolve8_c.
 */
void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter_x, int x_step_q4,
                        const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);

  assert(w <= 64);
  assert(h <= 64);

  /* Scaled filtering is not handled here; defer to the C version. */
  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vp9_convolve8_c(src, src_stride, dst, dst_stride,
                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  /* Pass 1: rows [-3, h + 4) of the source into the scratch buffer. */
  vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
                           filter_x, x_step_q4, filter_y, y_step_q4,
                           w, h + 7);
  /* Pass 2: scratch row 3 lines up with the original src row 0. */
  vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4, w, h);
}
/*
 * 2-D 8-tap "avg" convolution with SSE2 dispatch.
 *
 * For unscaled filtering (both steps equal to 16) the work is split into two
 * passes through a 16-byte-aligned scratch buffer with a fixed stride of 64:
 *   1. plain (non-avg) horizontal pass over h + 7 rows, starting three rows
 *      above src;
 *   2. avg vertical pass reading the scratch buffer from its fourth row and
 *      writing h rows to dst.
 * The buffer holds 64 * 71 bytes, i.e. (64 + 7) rows of 64 pixels, which is
 * why w and h are asserted to be at most 64. Any other step falls back to
 * vp9_convolve8_avg_c.
 */
void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);

  assert(w <= 64);
  assert(h <= 64);

  /* Scaled filtering is not handled here; defer to the C version. */
  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  /* Pass 1: rows [-3, h + 4) of the source into the scratch buffer. */
  vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
                           filter_x, x_step_q4, filter_y, y_step_q4,
                           w, h + 7);
  /* Pass 2: only the final (vertical) pass uses the avg variant. */
  vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
}
#endif
This diff is collapsed.
......@@ -75,6 +75,7 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_ss
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment