Commit 1e6a32f1 authored by Ronald S. Bultje's avatar Ronald S. Bultje

SSE2/SSSE3 optimizations and unit test for sub_pixel_avg_variance().

Encoding of bus @ 1500kbps (first 50 frames) goes from 3min57 to
3min35, i.e. approximately a 10.5% speedup. Note that the SIMD versions
which use a bilinear filter (x_offset & 7 || y_offset & 7) aren't
perfectly interleaved, and can probably be improved further in the
future. I've marked this with a few TODOs/FIXMEs in the code.

Change-Id: I5c9e900c0f0d32e431a50fecae213b510b2549f9
parent 8fb6c581
This diff is collapsed.
......@@ -269,81 +269,81 @@ prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int
specialize vp9_sub_pixel_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x64
specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x64
specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x32
specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x16
specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x32
specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x32
specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x16
specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x16
specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x8
specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x8
specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x4
specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x8
specialize vp9_sub_pixel_avg_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x4 sse ssse3
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x4
specialize vp9_sub_pixel_avg_variance4x4 sse ssse3
prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad64x64 sse2
......
This diff is collapsed.
......@@ -343,29 +343,22 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
return (var - (((int64_t)avg * avg) >> 11));
}
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
int vp9_sub_pixel_variance4xh_##opt2(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse); \
int vp9_sub_pixel_variance8xh_##opt1(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse); \
int vp9_sub_pixel_variance16xh_##opt1(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse)
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
......@@ -427,6 +420,86 @@ FNS(ssse3, ssse3);
#undef FNS
#undef FN
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
const uint8_t *sec, \
ptrdiff_t sec_stride, \
int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst, \
int dst_stride, \
unsigned int *sseptr, \
const uint8_t *sec) { \
unsigned int sse; \
int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sseptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1,); \
FN(8, 16, 8, 3, 4, opt1,); \
FN(8, 8, 8, 3, 3, opt1,); \
FN(8, 4, 8, 3, 2, opt1,); \
FN(4, 8, 4, 2, 3, opt2,); \
FN(4, 4, 4, 2, 2, opt2,)
FNS(sse2, sse);
FNS(ssse3, ssse3);
#undef FNS
#undef FN
unsigned int vp9_variance_halfpixvar16x16_h_wmt(
const unsigned char *src_ptr,
int src_pixels_per_line,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment