Commit 8fb6c581 authored by Ronald S. Bultje's avatar Ronald S. Bultje
Browse files

Implement sse2 and ssse3 versions for all sub_pixel_variance sizes.

Overall speedup around 5% (bus @ 1500kbps first 50 frames 4min10 ->
3min58). Specific changes to timings for each function compared to
original assembly-optimized versions (or just new version timings if
no previous assembly-optimized version was available):

sse2   4x4:    99 ->   82 cycles
sse2   4x8:           128 cycles
sse2   8x4:           121 cycles
sse2   8x8:   149 ->  129 cycles
sse2   8x16:  235 ->  245 cycles (?)
sse2  16x8:   269 ->  203 cycles
sse2  16x16:  441 ->  349 cycles
sse2  16x32:          641 cycles
sse2  32x16:          643 cycles
sse2  32x32: 1733 -> 1154 cycles
sse2  32x64:         2247 cycles
sse2  64x32:         2323 cycles
sse2  64x64: 6984 -> 4442 cycles

ssse3  4x4:           100 cycles (?)
ssse3  4x8:           103 cycles
ssse3  8x4:            71 cycles
ssse3  8x8:           147 cycles
ssse3  8x16:          158 cycles
ssse3 16x8:   188 ->  162 cycles
ssse3 16x16:  316 ->  273 cycles
ssse3 16x32:          535 cycles
ssse3 32x16:          564 cycles
ssse3 32x32:          973 cycles
ssse3 32x64:         1930 cycles
ssse3 64x32:         1922 cycles
ssse3 64x64:         3760 cycles

Change-Id: I81ff6fe51daf35a40d19785167004664d7e0c59d
parent 36568357
This diff is collapsed.
......@@ -266,85 +266,81 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid
specialize vp9_variance4x4 mmx sse2
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x64 sse2
specialize vp9_sub_pixel_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x64
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x64
specialize vp9_sub_pixel_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x64
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x32
specialize vp9_sub_pixel_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance64x32
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x16
specialize vp9_sub_pixel_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x16
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x32
specialize vp9_sub_pixel_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x32
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32 sse2
specialize vp9_sub_pixel_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance32x32
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
specialize vp9_sub_pixel_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x16
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x16 sse2 mmx
vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
specialize vp9_sub_pixel_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x16
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
specialize vp9_sub_pixel_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance16x8
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x8 sse2 mmx
vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
specialize vp9_sub_pixel_variance8x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x8
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x4
specialize vp9_sub_pixel_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance8x4
prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x8
specialize vp9_sub_pixel_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x8
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x4 sse2 mmx
vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
specialize vp9_sub_pixel_variance4x4 sse ssse3
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
specialize vp9_sub_pixel_avg_variance4x4
......@@ -390,15 +386,15 @@ prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, co
specialize vp9_sad4x4 mmx sse
prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_h mmx sse2
specialize vp9_variance_halfpixvar16x16_h sse2
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_v mmx sse2
specialize vp9_variance_halfpixvar16x16_v sse2
vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_variance_halfpixvar16x16_hv mmx sse2
specialize vp9_variance_halfpixvar16x16_hv sse2
vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
......@@ -507,8 +503,8 @@ specialize vp9_sad4x8x4d sse
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x4d sse
prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
specialize vp9_sub_pixel_mse16x16 sse2 mmx
#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
#specialize vp9_sub_pixel_mse16x16 sse2 mmx
prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"
specialize vp9_mse16x16 mmx sse2
......
This diff is collapsed.
......@@ -8,292 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define xmm_filter_shift 7
;void vp9_filter_block2d_bil_var_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned int Height,
; int xoffset,
; int yoffset,
; int *sum,
; unsigned int *sumsquared;;
;
;)
global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
sym(vp9_filter_block2d_bil_var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
push rbx
; end prolog
pxor xmm6, xmm6 ;
pxor xmm7, xmm7 ;
lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
movdqa xmm4, XMMWORD PTR [rsi]
lea rcx, [GLOBAL(bilinear_filters_sse2)]
movsxd rax, dword ptr arg(5) ; xoffset
cmp rax, 0 ; skip first_pass filter if xoffset=0
je filter_block2d_bil_var_sse2_sp_only
shl rax, 5 ; point to filter coeff with xoffset
lea rax, [rax + rcx] ; HFilter
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip second_pass filter if yoffset=0
je filter_block2d_bil_var_sse2_fp_only
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm5, xmm1
movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
lea rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
%endif
filter_block2d_bil_var_sse2_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movdqa xmm3, xmm5 ;
movdqa xmm5, xmm1 ;
pmullw xmm3, [rdx] ;
pmullw xmm1, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rbx] ;ref_pixels_per_line
%if ABI_IS_32BIT
add rdi, dword ptr arg(3) ;src_pixels_per_line
%else
lea rdi, [rdi + r9]
%endif
sub rcx, 1 ;
jnz filter_block2d_bil_var_sse2_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
je filter_block2d_bil_var_sse2_full_pixel
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
lea rsi, [rsi + rax]
filter_block2d_bil_sp_only_loop:
movq xmm3, QWORD PTR [rsi] ;
punpcklbw xmm3, xmm0 ;
movdqa xmm5, xmm3
pmullw xmm1, [rdx] ;
pmullw xmm3, [rdx+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
movdqa xmm1, xmm5 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_sp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_full_pixel:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
pxor xmm0, xmm0 ;
filter_block2d_bil_full_pixel_loop:
movq xmm1, QWORD PTR [rsi] ;
punpcklbw xmm1, xmm0 ;
movq xmm2, QWORD PTR [rdi] ;
punpcklbw xmm2, xmm0 ;
psubw xmm1, xmm2 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rax] ;ref_pixels_per_line
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_full_pixel_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_var_sse2_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
movsxd rcx, dword ptr arg(4) ;Height
movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
filter_block2d_bil_fp_only_loop:
movq xmm1, QWORD PTR [rsi] ;
movq xmm3, QWORD PTR [rsi+1] ;
punpcklbw xmm1, xmm0 ;
pmullw xmm1, [rax] ;
punpcklbw xmm3, xmm0 ;
pmullw xmm3, [rax+16] ;
paddw xmm1, xmm3 ;
paddw xmm1, xmm4 ;
psraw xmm1, xmm_filter_shift ;
movq xmm3, QWORD PTR [rdi] ;
punpcklbw xmm3, xmm0 ;
psubw xmm1, xmm3 ;
paddw xmm6, xmm1 ;
pmaddwd xmm1, xmm1 ;
paddd xmm7, xmm1 ;
lea rsi, [rsi + rdx]
lea rdi, [rdi + rbx] ;src_pixels_per_line
sub rcx, 1 ;
jnz filter_block2d_bil_fp_only_loop ;
jmp filter_block2d_bil_variance
filter_block2d_bil_variance:
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
psrldq xmm6, 8
psrldq xmm7, 8
movdq2q mm2, xmm6
movdq2q mm3, xmm7
paddw mm6, mm2
paddd mm7, mm3
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rsi, arg(7) ; sum
mov rdi, arg(8) ; sumsquared
movd [rsi], mm2 ; xsum
movd [rdi], mm4 ; xxsum
; begin epilog
pop rbx
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_half_horiz_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
......@@ -619,27 +335,3 @@ sym(vp9_half_horiz_variance16x_h_sse2):
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
times 8 dw 64
align 16
bilinear_filters_sse2:
dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
......@@ -508,344 +508,3 @@ sym(vp9_get4x4sse_cs_mmx):
UNSHADOW_ARGS
pop rbp
ret
%define mmx_filter_shift 7
;void vp9_filter_block2d_bil4x4_var_mmx
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
; unsigned char *src_ptr,
; int src_pixels_per_line,
; unsigned short *HFilter,
; unsigned short *VFilter,
; int *sum,
; unsigned int *sumsquared
;)
global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp9_filter_block2d_bil4x4_var_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
pxor mm6, mm6 ;
pxor mm7, mm7 ;
mov rax, arg(4) ;HFilter ;
mov rdx, arg(5) ;VFilter ;
mov rsi, arg(0) ;ref_ptr ;
mov rdi, arg(2) ;src_ptr ;
mov rcx, 4 ;
pxor mm0, mm0 ;
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm5, mm1
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
add rsi, r8
%endif
.filter_block2d_bil4x4_var_mmx_loop:
movd mm1, [rsi] ;
movd mm3, [rsi+1] ;
punpcklbw mm1, mm0 ;
pmullw mm1, [rax] ;
punpcklbw mm3, mm0 ;
pmullw mm3, [rax+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movq mm3, mm5 ;
movq mm5, mm1 ;
pmullw mm3, [rdx] ;
pmullw mm1, [rdx+8] ;
paddw mm1, mm3 ;
paddw mm1, [GLOBAL(mmx_bi_rd)] ;
psraw mm1, mmx_filter_shift ;
movd mm3, [rdi] ;
punpcklbw mm3, mm0 ;
psubw mm1, mm3 ;
paddw mm6, mm1 ;
pmaddwd mm1, mm1 ;
paddd mm7, mm1 ;
%if ABI_IS_32BIT
add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
add rdi, dword ptr arg(3) ;src_pixels_per_line ;
%else
movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
add rsi, r8
add rdi, r9
%endif
sub rcx, 1 ;
jnz .filter_block2d_bil4x4_var_mmx_loop ;
pxor mm3, mm3 ;
pxor mm2, mm2 ;
punpcklwd mm2, mm6 ;
punpckhwd mm3, mm6 ;
paddd mm2, mm3 ;
movq mm6, mm2 ;
psrlq mm6, 32 ;
paddd mm2, mm6 ;
psrad mm2, 16 ;
movq mm4, mm7 ;
psrlq mm4, 32 ;
paddd mm4, mm7 ;
mov rdi, arg(6) ;sum
mov rsi, arg(7) ;sumsquared
movd dword ptr [rdi], mm2 ;
movd dword ptr [rsi], mm4 ;
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT