Commit e3efed7f, authored by James Zern, committed by Gerrit Code Review
Browse files

Merge "convolve_copy_sse2: replace SSE w/SSE2 code"

parents f4832197 40dab589
......@@ -13,15 +13,21 @@
SECTION .text
%macro convolve_fn 1-2
INIT_XMM sse2
%ifidn %1, avg
%define AUX_XMM_REGS 4
%else
%define AUX_XMM_REGS 0
%endif
%ifidn %2, highbd
%define pavg pavgw
cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
fx, fxs, fy, fys, w, h, bd
cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
dst, dst_stride, \
fx, fxs, fy, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
fx, fxs, fy, fys, w, h
cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
dst, dst_stride, \
fx, fxs, fy, fys, w, h
%endif
mov r4d, dword wm
%ifidn %2, highbd
......@@ -152,27 +158,30 @@ cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
jnz .loop16
RET
INIT_MMX sse
.w8:
mov r4d, dword hm
lea r5q, [src_strideq*3]
lea r6q, [dst_strideq*3]
.loop8:
movu m0, [srcq]
movu m1, [srcq+src_strideq]
movu m2, [srcq+src_strideq*2]
movu m3, [srcq+r5q]
movh m0, [srcq]
movh m1, [srcq+src_strideq]
movh m2, [srcq+src_strideq*2]
movh m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
pavg m0, [dstq]
pavg m1, [dstq+dst_strideq]
pavg m2, [dstq+dst_strideq*2]
pavg m3, [dstq+r6q]
movh m4, [dstq]
movh m5, [dstq+dst_strideq]
movh m6, [dstq+dst_strideq*2]
movh m7, [dstq+r6q]
pavg m0, m4
pavg m1, m5
pavg m2, m6
pavg m3, m7
%endif
mova [dstq ], m0
mova [dstq+dst_strideq ], m1
mova [dstq+dst_strideq*2], m2
mova [dstq+r6q ], m3
movh [dstq ], m0
movh [dstq+dst_strideq ], m1
movh [dstq+dst_strideq*2], m2
movh [dstq+r6q ], m3
lea dstq, [dstq+dst_strideq*4]
sub r4d, 4
jnz .loop8
......@@ -184,25 +193,25 @@ INIT_MMX sse
lea r5q, [src_strideq*3]
lea r6q, [dst_strideq*3]
.loop4:
movh m0, [srcq]
movh m1, [srcq+src_strideq]
movh m2, [srcq+src_strideq*2]
movh m3, [srcq+r5q]
movd m0, [srcq]
movd m1, [srcq+src_strideq]
movd m2, [srcq+src_strideq*2]
movd m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
movh m4, [dstq]
movh m5, [dstq+dst_strideq]
movh m6, [dstq+dst_strideq*2]
movh m7, [dstq+r6q]
movd m4, [dstq]
movd m5, [dstq+dst_strideq]
movd m6, [dstq+dst_strideq*2]
movd m7, [dstq+r6q]
pavg m0, m4
pavg m1, m5
pavg m2, m6
pavg m3, m7
%endif
movh [dstq ], m0
movh [dstq+dst_strideq ], m1
movh [dstq+dst_strideq*2], m2
movh [dstq+r6q ], m3
movd [dstq ], m0
movd [dstq+dst_strideq ], m1
movd [dstq+dst_strideq*2], m2
movd [dstq+r6q ], m3
lea dstq, [dstq+dst_strideq*4]
sub r4d, 4
jnz .loop4
......@@ -210,6 +219,7 @@ INIT_MMX sse
%endif
%endmacro
INIT_XMM sse2
convolve_fn copy
convolve_fn avg
%if CONFIG_VP9_HIGHBITDEPTH
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment