Commit 0c481f4d authored by Ronald S. Bultje

Add SSE2 versions for rectangular sad and sad4d functions.

About 11% overall encoder speedup with the sbsegment experiment enabled.

Change-Id: Iffb1bdba6932d9f11a6c791cda8697ccf9327183
parent e6934722
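
For orientation, the scalar reference that these SSE2 kernels accelerate looks like the sketch below. A SAD (sum of absolute differences) scores how closely a candidate reference block matches the source block during motion search; the "rectangular" sizes are the non-square blocks (64x32, 32x64, 32x16, 16x32) used by the sbsegment experiment. This is an illustrative sketch, not code from the patch, and the helper name is made up (libvpx's real C fallbacks are per-size functions such as vp9_sad32x16_c):

```c
#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar SAD over a width x height block. */
static unsigned int sad_c(const uint8_t *src_ptr, int source_stride,
                          const uint8_t *ref_ptr, int ref_stride,
                          int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; r++) {
    for (c = 0; c < width; c++)
      sad += abs(src_ptr[c] - ref_ptr[c]);
    src_ptr += source_stride;
    ref_ptr += ref_stride;
  }
  return sad;
}
```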
@@ -376,7 +376,8 @@ if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then
 # variance
 [ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
-#if CONFIG_SBSEGMENT
+if [ "$CONFIG_SBSEGMENT" = "yes" ]; then
 prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance32x16
@@ -388,7 +389,8 @@ specialize vp9_variance64x32
 prototype unsigned int vp9_variance32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance32x64
-#endif
+fi
 prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance32x32
@@ -424,7 +426,8 @@ vp9_variance4x4_mmx=vp9_variance4x4_mmx
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x64 sse2
-#if CONFIG_SBSEGMENT
+if [ "$CONFIG_SBSEGMENT" = "yes" ]; then
 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x64
@@ -436,7 +439,8 @@ specialize vp9_sub_pixel_variance32x16
 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x32
-#endif
+fi
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x32 sse2
@@ -464,19 +468,21 @@ vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad64x64 sse2
-#if CONFIG_SBSEGMENT
+if [ "$CONFIG_SBSEGMENT" = "yes" ]; then
 prototype unsigned int vp9_sad32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad32x64
+specialize vp9_sad32x64 sse2
 prototype unsigned int vp9_sad64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad64x32
+specialize vp9_sad64x32 sse2
 prototype unsigned int vp9_sad32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad32x16
+specialize vp9_sad32x16 sse2
 prototype unsigned int vp9_sad16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad16x32
-#endif
+specialize vp9_sad16x32 sse2
+fi
 prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp9_sad32x32 sse2
@@ -571,19 +577,21 @@ specialize vp9_sad4x4x8 sse4
 prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
 specialize vp9_sad64x64x4d sse2
-#if CONFIG_SBSEGMENT
+if [ "$CONFIG_SBSEGMENT" = "yes" ]; then
 prototype void vp9_sad32x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x64x4d
+specialize vp9_sad32x64x4d sse2
 prototype void vp9_sad64x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad64x32x4d
+specialize vp9_sad64x32x4d sse2
 prototype void vp9_sad32x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x16x4d
+specialize vp9_sad32x16x4d sse2
 prototype void vp9_sad16x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x32x4d
-#endif
+specialize vp9_sad16x32x4d sse2
+fi
 prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
 specialize vp9_sad32x32x4d sse2
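A note on the prototype/specialize lines above: libvpx's run-time CPU detection (RTCD) generator turns each `prototype` into a function pointer and each `specialize ... sse2` into a runtime assignment guarded by a CPU-feature check. Roughly, the generated glue behaves like the sketch below; the flag constant and setup function name here are illustrative, not copied from the generated header:

```c
#include <stdint.h>

#define HAS_SSE2 0x08  /* illustrative bit; the real value comes from the RTCD header */

unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride,
                            const uint8_t *ref_ptr, int ref_stride,
                            unsigned int max_sad);
unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride,
                               const uint8_t *ref_ptr, int ref_stride,
                               unsigned int max_sad);

/* The "prototype" line becomes a function pointer... */
unsigned int (*vp9_sad32x16)(const uint8_t *src_ptr, int source_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             unsigned int max_sad);

/* ...and "specialize vp9_sad32x16 sse2" makes setup pick the SSE2
 * body when the CPU supports it, with the C version as fallback. */
static void setup_rtcd_sketch(int flags) {
  vp9_sad32x16 = vp9_sad32x16_c;
  if (flags & HAS_SSE2) vp9_sad32x16 = vp9_sad32x16_sse2;
}
```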
@@ -215,7 +215,11 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
 INIT_XMM sse2
 SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
 SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
 SADNXN4D 16, 16
 SADNXN4D 16, 8
 SADNXN4D  8, 16
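The x4d kernels instantiated above score one source block against four candidate references per call, which lets the assembly reuse each loaded source row across all four comparisons; that reuse is where the win over four separate SAD calls comes from. Functionally they match this scalar sketch (helper names hypothetical, `sad_c` as sketched earlier):

```c
#include <stdint.h>

/* scalar single-reference SAD from the earlier sketch */
unsigned int sad_c(const uint8_t *src_ptr, int src_stride,
                   const uint8_t *ref_ptr, int ref_stride,
                   int width, int height);

/* One source block scored against four references in one call. */
static void sad_x4d_c(const uint8_t *src_ptr, int src_stride,
                      const uint8_t *const ref_ptr[4], int ref_stride,
                      int width, int height, unsigned int sad_array[4]) {
  int i;
  for (i = 0; i < 4; i++)
    sad_array[i] = sad_c(src_ptr, src_stride, ref_ptr[i], ref_stride,
                         width, height);
}
```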
@@ -14,11 +14,11 @@ SECTION .text
 ; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
-INIT_XMM sse2
-cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD64XN 1
+cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
-  mov              n_rowsd, 64
+  mov              n_rowsd, %1
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -42,14 +42,19 @@ cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   paddd                 m0, m1
   movd                 eax, m0
   RET
+%endmacro
+
+INIT_XMM sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
 
 ; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
-INIT_XMM sse2
-cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD32XN 1
+cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
-  mov              n_rowsd, 16
+  mov              n_rowsd, %1/2
   pxor                  m0, m0
 .loop:
@@ -74,6 +79,12 @@ cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   paddd                 m0, m1
   movd                 eax, m0
   RET
+%endmacro
+
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
 
 ; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
@@ -112,6 +123,7 @@ cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
 %endmacro
 
 INIT_XMM sse2
+SAD16XN 32 ; sad16x32_sse2
 SAD16XN 16 ; sad16x16_sse2
 SAD16XN  8 ; sad16x8_sse2
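To make the `mov n_rowsd, %1/2` above concrete: SAD32XN consumes two 32-pixel rows per loop iteration (four 16-byte `psadbw` operations), so a 32xH block needs H/2 iterations, whereas SAD64XN covers one 64-pixel row per iteration and loads the counter with `%1` directly. Below is an intrinsics rendering of the 32-wide loop, for illustration only; the shipped kernel is the assembly above, and like it (only four arguments are consumed by `cglobal`), this sketch ignores the `max_sad` early-out parameter:

```c
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

static unsigned int sad32xh_sse2(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int height) {
  __m128i sum = _mm_setzero_si128();
  int n;
  for (n = 0; n < height / 2; n++) {  /* two rows per iteration */
    /* psadbw: |a-b| summed over 16 byte pairs, one 16-bit partial
     * sum in each 64-bit half of the register */
    __m128i a = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(ref + 0)),
                             _mm_loadu_si128((const __m128i *)(src + 0)));
    __m128i b = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(ref + 16)),
                             _mm_loadu_si128((const __m128i *)(src + 16)));
    __m128i c = _mm_sad_epu8(
        _mm_loadu_si128((const __m128i *)(ref + ref_stride)),
        _mm_loadu_si128((const __m128i *)(src + src_stride)));
    __m128i d = _mm_sad_epu8(
        _mm_loadu_si128((const __m128i *)(ref + ref_stride + 16)),
        _mm_loadu_si128((const __m128i *)(src + src_stride + 16)));
    sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_add_epi32(a, b),
                                           _mm_add_epi32(c, d)));
    src += 2 * src_stride;
    ref += 2 * ref_stride;
  }
  /* fold the high 64-bit half into the low one (movhlps + paddd in
   * the asm), then extract the 32-bit total */
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  return (unsigned int)_mm_cvtsi128_si32(sum);
}
```

Each `psadbw` partial sum is at most 16 * 255 = 4080, and the largest rectangular total (32x64) is at most 522240, so 32-bit accumulation with `paddd` cannot overflow.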