Commit d2a2d5a6 authored by John Koleszar
Browse files

Merge remote branch 'origin/master' into experimental

Change-Id: If53ec5c1219b31e5ef9ae552d9cc79432ebda267
parents 7cb25d9c c5f890af
......@@ -40,7 +40,7 @@ sym(vp8_loop_filter_horizontal_edge_mmx):
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
movsxd rcx, dword ptr arg(5) ;count
next8_h:
.next8_h:
mov rdx, arg(3) ;limit
movq mm7, [rdx]
mov rdi, rsi ; rdi points to row +1 for indirect addressing
......@@ -211,7 +211,7 @@ next8_h:
add rsi,8
neg rax
dec rcx
jnz next8_h
jnz .next8_h
add rsp, 32
pop rsp
......@@ -255,7 +255,7 @@ sym(vp8_loop_filter_vertical_edge_mmx):
lea rsi, [rsi + rax*4 - 4]
movsxd rcx, dword ptr arg(5) ;count
next8_v:
.next8_v:
mov rdi, rsi ; rdi points to row +1 for indirect addressing
add rdi, rax
......@@ -581,7 +581,7 @@ next8_v:
lea rsi, [rsi+rax*8]
dec rcx
jnz next8_v
jnz .next8_v
add rsp, 64
pop rsp
......@@ -622,7 +622,7 @@ sym(vp8_mbloop_filter_horizontal_edge_mmx):
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
movsxd rcx, dword ptr arg(5) ;count
next8_mbh:
.next8_mbh:
mov rdx, arg(3) ;limit
movq mm7, [rdx]
mov rdi, rsi ; rdi points to row +1 for indirect addressing
......@@ -898,7 +898,7 @@ next8_mbh:
neg rax
add rsi,8
dec rcx
jnz next8_mbh
jnz .next8_mbh
add rsp, 32
pop rsp
......@@ -942,7 +942,7 @@ sym(vp8_mbloop_filter_vertical_edge_mmx):
lea rsi, [rsi + rax*4 - 4]
movsxd rcx, dword ptr arg(5) ;count
next8_mbv:
.next8_mbv:
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
;transpose
......@@ -1365,7 +1365,7 @@ next8_mbv:
lea rsi, [rsi+rax*8]
dec rcx
jnz next8_mbv
jnz .next8_mbv
add rsp, 96
pop rsp
......@@ -1398,7 +1398,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_mmx):
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
mov rcx, 2 ; count
nexts8_h:
.nexts8_h:
mov rdx, arg(2) ;blimit ; get blimit
movq mm3, [rdx] ;
......@@ -1483,7 +1483,7 @@ nexts8_h:
add rsi,8
neg rax
dec rcx
jnz nexts8_h
jnz .nexts8_h
; begin epilog
pop rdi
......@@ -1520,7 +1520,7 @@ sym(vp8_loop_filter_simple_vertical_edge_mmx):
lea rsi, [rsi + rax*4- 2]; ;
mov rcx, 2 ; count
nexts8_v:
.nexts8_v:
lea rdi, [rsi + rax];
movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
......@@ -1695,7 +1695,7 @@ nexts8_v:
lea rsi, [rsi+rax*8] ; next 8
dec rcx
jnz nexts8_v
jnz .nexts8_v
add rsp, 32
pop rsp
......
......@@ -58,10 +58,10 @@ sym(vp8_post_proc_down_and_across_mmx):
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
nextrow:
.nextrow:
xor rdx, rdx ; clear out rdx for use as loop counter
nextcol:
.nextcol:
pxor mm7, mm7 ; mm7 = 00000000
movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
......@@ -146,7 +146,7 @@ nextcol:
add rdx, 4
cmp edx, dword ptr arg(5) ;cols
jl nextcol
jl .nextcol
; done with all the cols, start the across filtering in place
sub rsi, rdx
sub rdi, rdx
......@@ -156,7 +156,7 @@ nextcol:
xor rdx, rdx
mov rax, [rdi-4];
acrossnextcol:
.acrossnextcol:
pxor mm7, mm7 ; mm7 = 00000000
movq mm6, [rbx + 32 ] ;
movq mm4, [rdi+rdx] ; mm4 = p0..p7
......@@ -237,7 +237,7 @@ acrossnextcol:
add rdx, 4
cmp edx, dword ptr arg(5) ;cols
jl acrossnextcol;
jl .acrossnextcol;
mov DWORD PTR [rdi+rdx-4], eax
pop rax
......@@ -249,7 +249,7 @@ acrossnextcol:
movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
dec rcx ; decrement count
jnz nextrow ; next row
jnz .nextrow ; next row
pop rbx
; begin epilog
......@@ -293,7 +293,7 @@ sym(vp8_mbpost_proc_down_mmx):
add dword ptr arg(2), 8
;for(c=0; c<cols; c+=4)
loop_col:
.loop_col:
mov rsi, arg(0) ;s
pxor mm0, mm0 ;
......@@ -312,7 +312,7 @@ loop_col:
mov rcx, 15 ;
loop_initvar:
.loop_initvar:
movd mm1, DWORD PTR [rdi];
punpcklbw mm1, mm0 ;
......@@ -329,10 +329,10 @@ loop_initvar:
lea rdi, [rdi+rax] ;
dec rcx
jne loop_initvar
jne .loop_initvar
;save the var and sum
xor rdx, rdx
loop_row:
.loop_row:
movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
......@@ -438,13 +438,13 @@ loop_row:
add rdx, 1
cmp edx, dword arg(2) ;rows
jl loop_row
jl .loop_row
add dword arg(0), 4 ; s += 4
sub dword arg(3), 4 ; cols -= 4
cmp dword arg(3), 0
jg loop_col
jg .loop_col
add rsp, 136
pop rsp
......@@ -475,7 +475,7 @@ sym(vp8_plane_add_noise_mmx):
push rdi
; end prolog
addnoise_loop:
.addnoise_loop:
call sym(rand) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
......@@ -492,7 +492,7 @@ addnoise_loop:
mov rsi, arg(0) ;Pos
xor rax,rax
addnoise_nextset:
.addnoise_nextset:
movq mm1,[rsi+rax] ; get the source
psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
......@@ -506,12 +506,12 @@ addnoise_nextset:
add rax,8 ; move to the next line
cmp rax, rcx
jl addnoise_nextset
jl .addnoise_nextset
movsxd rax, dword arg(7) ; Pitch
add arg(0), rax ; Start += Pitch
sub dword arg(6), 1 ; Height -= 1
jg addnoise_loop
jg .addnoise_loop
; begin epilog
pop rdi
......
......@@ -57,10 +57,10 @@ sym(vp8_post_proc_down_and_across_xmm):
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
pxor xmm0, xmm0 ; xmm0 = 0
nextrow:
.nextrow:
xor rdx, rdx ; clear out rdx for use as loop counter
nextcol:
.nextcol:
movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
punpcklbw xmm3, xmm0 ; mm3 = p0..p3
movdqa xmm1, xmm3 ; mm1 = p0..p3
......@@ -133,7 +133,7 @@ nextcol:
add rdx, 8
cmp edx, dword arg(5) ;cols
jl nextcol
jl .nextcol
; done with all the cols, start the across filtering in place
sub rsi, rdx
......@@ -142,7 +142,7 @@ nextcol:
xor rdx, rdx
movq mm0, QWORD PTR [rdi-8];
acrossnextcol:
.acrossnextcol:
movq xmm7, QWORD PTR [rdi +rdx -2]
movd xmm4, DWORD PTR [rdi +rdx +6]
......@@ -219,7 +219,7 @@ acrossnextcol:
add rdx, 8
cmp edx, dword arg(5) ;cols
jl acrossnextcol;
jl .acrossnextcol;
; last 8 pixels
movq QWORD PTR [rdi+rdx-8], mm0
......@@ -231,7 +231,7 @@ acrossnextcol:
mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
dec rcx ; decrement count
jnz nextrow ; next row
jnz .nextrow ; next row
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
add rsp,16
......@@ -282,7 +282,7 @@ sym(vp8_mbpost_proc_down_xmm):
add dword arg(2), 8
;for(c=0; c<cols; c+=8)
loop_col:
.loop_col:
mov rsi, arg(0) ; s
pxor xmm0, xmm0 ;
......@@ -301,7 +301,7 @@ loop_col:
mov rcx, 15 ;
loop_initvar:
.loop_initvar:
movq xmm1, QWORD PTR [rdi];
punpcklbw xmm1, xmm0 ;
......@@ -318,10 +318,10 @@ loop_initvar:
lea rdi, [rdi+rax] ;
dec rcx
jne loop_initvar
jne .loop_initvar
;save the var and sum
xor rdx, rdx
loop_row:
.loop_row:
movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
......@@ -428,12 +428,12 @@ loop_row:
add rdx, 1
cmp edx, dword arg(2) ;rows
jl loop_row
jl .loop_row
add dword arg(0), 8 ; s += 8
sub dword arg(3), 8 ; cols -= 8
cmp dword arg(3), 0
jg loop_col
jg .loop_col
add rsp, 128+16
pop rsp
......@@ -475,13 +475,13 @@ sym(vp8_mbpost_proc_across_ip_xmm):
;for(r=0;r<rows;r++)
ip_row_loop:
.ip_row_loop:
xor rdx, rdx ;sumsq=0;
xor rcx, rcx ;sum=0;
mov rsi, arg(0); s
mov rdi, -8
ip_var_loop:
.ip_var_loop:
;for(i=-8;i<=6;i++)
;{
; sumsq += s[i]*s[i];
......@@ -493,7 +493,7 @@ ip_var_loop:
add edx, eax
add rdi, 1
cmp rdi, 6
jle ip_var_loop
jle .ip_var_loop
;mov rax, sumsq
......@@ -513,7 +513,7 @@ ip_var_loop:
pxor mm1, mm1
pxor xmm0, xmm0
nextcol4:
.nextcol4:
movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
......@@ -600,7 +600,7 @@ nextcol4:
add rcx, 4
cmp rcx, rdx
jl nextcol4
jl .nextcol4
;s+=pitch;
movsxd rax, dword arg(1)
......@@ -608,7 +608,7 @@ nextcol4:
sub dword arg(2), 1 ;rows-=1
cmp dword arg(2), 0
jg ip_row_loop
jg .ip_row_loop
add rsp, 16
pop rsp
......@@ -640,7 +640,7 @@ sym(vp8_plane_add_noise_wmt):
push rdi
; end prolog
addnoise_loop:
.addnoise_loop:
call sym(rand) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
......@@ -657,7 +657,7 @@ addnoise_loop:
mov rsi, arg(0) ;Pos
xor rax,rax
addnoise_nextset:
.addnoise_nextset:
movdqu xmm1,[rsi+rax] ; get the source
psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
......@@ -671,12 +671,12 @@ addnoise_nextset:
add rax,16 ; move to the next line
cmp rax, rcx
jl addnoise_nextset
jl .addnoise_nextset
movsxd rax, dword arg(7) ; Pitch
add arg(0), rax ; Start += Pitch
sub dword arg(6), 1 ; Height -= 1
jg addnoise_loop
jg .addnoise_loop
; begin epilog
pop rdi
......
......@@ -503,7 +503,7 @@ sym(vp8_intra_pred_uv_tm_%1):
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
vp8_intra_pred_uv_tm_%1_loop:
.vp8_intra_pred_uv_tm_%1_loop:
movd xmm3, [rsi]
movd xmm5, [rsi+rax]
%ifidn %1, sse2
......@@ -525,7 +525,7 @@ vp8_intra_pred_uv_tm_%1_loop:
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
jnz vp8_intra_pred_uv_tm_%1_loop
jnz .vp8_intra_pred_uv_tm_%1_loop
; begin epilog
pop rdi
......@@ -615,7 +615,7 @@ sym(vp8_intra_pred_uv_ho_%1):
%endif
dec rsi
%ifidn %1, mmx2
vp8_intra_pred_uv_ho_%1_loop:
.vp8_intra_pred_uv_ho_%1_loop:
movd mm0, [rsi]
movd mm1, [rsi+rax]
punpcklbw mm0, mm0
......@@ -627,7 +627,7 @@ vp8_intra_pred_uv_ho_%1_loop:
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rcx*2]
dec edx
jnz vp8_intra_pred_uv_ho_%1_loop
jnz .vp8_intra_pred_uv_ho_%1_loop
%else
movd xmm0, [rsi]
movd xmm3, [rsi+rax]
......
......@@ -50,7 +50,7 @@ sym(vp8_filter_block1d_h6_mmx):
movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
nextrow:
.nextrow:
movq mm3, [rsi-2] ; mm3 = p-2..p5
movq mm4, mm3 ; mm4 = p-2..p5
psrlq mm3, 8 ; mm3 = p-1..p5
......@@ -102,7 +102,7 @@ nextrow:
%endif
dec rcx ; decrement count
jnz nextrow ; next row
jnz .nextrow ; next row
; begin epilog
pop rdi
......@@ -152,7 +152,7 @@ sym(vp8_filter_block1dc_v6_mmx):
pxor mm0, mm0 ; mm0 = 00000000
nextrow_cv:
.nextrow_cv:
movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
......@@ -190,7 +190,7 @@ nextrow_cv:
; avoidable!!!.
lea rdi, [rdi+rax] ;
dec rcx ; decrement count
jnz nextrow_cv ; next row
jnz .nextrow_cv ; next row
pop rbx
......@@ -282,7 +282,7 @@ sym(vp8_bilinear_predict8x8_mmx):
packuswb mm7, mm4 ;
add rsi, rdx ; next line
next_row_8x8:
.next_row_8x8:
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
......@@ -349,7 +349,7 @@ next_row_8x8:
add rdi, r8 ;dst_pitch
%endif
cmp rdi, rcx ;
jne next_row_8x8
jne .next_row_8x8
; begin epilog
pop rdi
......@@ -437,7 +437,7 @@ sym(vp8_bilinear_predict8x4_mmx):
packuswb mm7, mm4 ;
add rsi, rdx ; next line
next_row_8x4:
.next_row_8x4:
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
movq mm4, mm3 ; make a copy of current line
......@@ -504,7 +504,7 @@ next_row_8x4:
add rdi, r8
%endif
cmp rdi, rcx ;
jne next_row_8x4
jne .next_row_8x4
; begin epilog
pop rdi
......@@ -579,7 +579,7 @@ sym(vp8_bilinear_predict4x4_mmx):
packuswb mm7, mm0 ;
add rsi, rdx ; next line
next_row_4x4:
.next_row_4x4:
movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
......@@ -622,7 +622,7 @@ next_row_4x4:
%endif
cmp rdi, rcx ;
jne next_row_4x4
jne .next_row_4x4
; begin epilog
pop rdi
......
......@@ -55,7 +55,7 @@ sym(vp8_filter_block1d8_h6_sse2):
%endif
pxor xmm0, xmm0 ; clear xmm0 for unpack
filter_block1d8_h6_rowloop:
.filter_block1d8_h6_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
......@@ -124,7 +124,7 @@ filter_block1d8_h6_rowloop:
%endif
dec rcx
jnz filter_block1d8_h6_rowloop ; next row
jnz .filter_block1d8_h6_rowloop ; next row
; begin epilog
pop rdi
......@@ -176,7 +176,7 @@ sym(vp8_filter_block1d16_h6_sse2):
pxor xmm0, xmm0 ; clear xmm0 for unpack
filter_block1d16_h6_sse2_rowloop:
.filter_block1d16_h6_sse2_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
......@@ -301,7 +301,7 @@ filter_block1d16_h6_sse2_rowloop:
%endif
dec rcx
jnz filter_block1d16_h6_sse2_rowloop ; next row
jnz .filter_block1d16_h6_sse2_rowloop ; next row
; begin epilog
pop rdi
......@@ -356,7 +356,7 @@ sym(vp8_filter_block1d8_v6_sse2):
movsxd r8, dword ptr arg(2) ; dst_pitch
%endif
vp8_filter_block1d8_v6_sse2_loop:
.vp8_filter_block1d8_v6_sse2_loop:
movdqa xmm1, XMMWORD PTR [rsi]
pmullw xmm1, [rax]
......@@ -396,7 +396,7 @@ vp8_filter_block1d8_v6_sse2_loop:
add rdi, r8
%endif
dec rcx ; decrement count
jnz vp8_filter_block1d8_v6_sse2_loop ; next row
jnz .vp8_filter_block1d8_v6_sse2_loop ; next row
; begin epilog
pop rdi
......@@ -448,7 +448,7 @@ sym(vp8_filter_block1d16_v6_sse2):
movsxd r8, dword ptr arg(2) ; dst_pitch
%endif
vp8_filter_block1d16_v6_sse2_loop:
.vp8_filter_block1d16_v6_sse2_loop:
; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
......@@ -511,7 +511,7 @@ vp8_filter_block1d16_v6_sse2_loop:
add rdi, r8
%endif
dec rcx ; decrement count
jnz vp8_filter_block1d16_v6_sse2_loop ; next row
jnz .vp8_filter_block1d16_v6_sse2_loop ; next row
; begin epilog
pop rdi
......@@ -556,7 +556,7 @@ sym(vp8_filter_block1d8_h6_only_sse2):
%endif
pxor xmm0, xmm0 ; clear xmm0 for unpack
filter_block1d8_h6_only_rowloop:
.filter_block1d8_h6_only_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
......@@ -624,7 +624,7 @@ filter_block1d8_h6_only_rowloop:
%endif
dec rcx
jnz filter_block1d8_h6_only_rowloop ; next row
jnz .filter_block1d8_h6_only_rowloop ; next row
; begin epilog
pop rdi
......@@ -670,7 +670,7 @@ sym(vp8_filter_block1d16_h6_only_sse2):
pxor xmm0, xmm0 ; clear xmm0 for unpack
filter_block1d16_h6_only_sse2_rowloop:
.filter_block1d16_h6_only_sse2_rowloop:
movq xmm3, MMWORD PTR [rsi - 2]
movq xmm1, MMWORD PTR [rsi + 6]
......@@ -789,7 +789,7 @@ filter_block1d16_h6_only_sse2_rowloop:
%endif
dec rcx
jnz filter_block1d16_h6_only_sse2_rowloop ; next row
jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
; begin epilog
pop rdi
......@@ -837,7 +837,7 @@ sym(vp8_filter_block1d8_v6_only_sse2):
movsxd r8, dword ptr arg(3) ; dst_pitch
%endif
vp8_filter_block1d8_v6_only_sse2_loop:
.vp8_filter_block1d8_v6_only_sse2_loop:
movq xmm1, MMWORD PTR [rsi]
movq xmm2, MMWORD PTR [rsi + rdx]
movq xmm3, MMWORD PTR [rsi + rdx * 2]
......@@ -883,7 +883,7 @@ vp8_filter_block1d8_v6_only_sse2_loop: