From c5f890af2cff951048cc41630f2523b61fb74a0b Mon Sep 17 00:00:00 2001 From: Fritz Koenig <frkoenig@google.com> Date: Mon, 22 Aug 2011 15:29:41 -0700 Subject: [PATCH] Use local labels for jumps/loops in x86 assembly. Prepend . to local labels in assembly code. This allows non unique labels within a file. Also makes profiling information more informative by keeping the function name with the loop name. Change-Id: I7a983cb3a5ba2413d5dafd0a37936b268fb9e37f --- vp8/common/x86/loopfilter_mmx.asm | 24 +-- vp8/common/x86/postproc_mmx.asm | 32 ++-- vp8/common/x86/postproc_sse2.asm | 44 ++--- vp8/common/x86/recon_sse2.asm | 8 +- vp8/common/x86/subpixel_mmx.asm | 20 +-- vp8/common/x86/subpixel_sse2.asm | 62 +++---- vp8/common/x86/subpixel_ssse3.asm | 100 +++++------ vp8/encoder/x86/encodeopt.asm | 16 +- vp8/encoder/x86/quantize_sse2.asm | 6 +- vp8/encoder/x86/quantize_sse4.asm | 6 +- vp8/encoder/x86/sad_mmx.asm | 16 +- vp8/encoder/x86/sad_sse2.asm | 40 ++--- vp8/encoder/x86/sad_sse3.asm | 12 +- vp8/encoder/x86/sad_ssse3.asm | 164 +++++++++--------- vp8/encoder/x86/ssim_opt.asm | 8 +- vp8/encoder/x86/subtract_mmx.asm | 4 +- vp8/encoder/x86/subtract_sse2.asm | 4 +- .../x86/temporal_filter_apply_sse2.asm | 18 +- vp8/encoder/x86/variance_impl_mmx.asm | 12 +- vp8/encoder/x86/variance_impl_sse2.asm | 8 +- vp8/encoder/x86/variance_impl_ssse3.asm | 38 ++-- 21 files changed, 321 insertions(+), 321 deletions(-) diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm index ad47284cf9..697a5dee60 100644 --- a/vp8/common/x86/loopfilter_mmx.asm +++ b/vp8/common/x86/loopfilter_mmx.asm @@ -40,7 +40,7 @@ sym(vp8_loop_filter_horizontal_edge_mmx): movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? movsxd rcx, dword ptr arg(5) ;count -next8_h: +.next8_h: mov rdx, arg(3) ;limit movq mm7, [rdx] mov rdi, rsi ; rdi points to row +1 for indirect addressing @@ -211,7 +211,7 @@ next8_h: add rsi,8 neg rax dec rcx - jnz next8_h + jnz .next8_h add rsp, 32 pop rsp @@ -255,7 +255,7 @@ sym(vp8_loop_filter_vertical_edge_mmx): lea rsi, [rsi + rax*4 - 4] movsxd rcx, dword ptr arg(5) ;count -next8_v: +.next8_v: mov rdi, rsi ; rdi points to row +1 for indirect addressing add rdi, rax @@ -581,7 +581,7 @@ next8_v: lea rsi, [rsi+rax*8] dec rcx - jnz next8_v + jnz .next8_v add rsp, 64 pop rsp @@ -622,7 +622,7 @@ sym(vp8_mbloop_filter_horizontal_edge_mmx): movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? movsxd rcx, dword ptr arg(5) ;count -next8_mbh: +.next8_mbh: mov rdx, arg(3) ;limit movq mm7, [rdx] mov rdi, rsi ; rdi points to row +1 for indirect addressing @@ -898,7 +898,7 @@ next8_mbh: neg rax add rsi,8 dec rcx - jnz next8_mbh + jnz .next8_mbh add rsp, 32 pop rsp @@ -942,7 +942,7 @@ sym(vp8_mbloop_filter_vertical_edge_mmx): lea rsi, [rsi + rax*4 - 4] movsxd rcx, dword ptr arg(5) ;count -next8_mbv: +.next8_mbv: lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing ;transpose @@ -1365,7 +1365,7 @@ next8_mbv: lea rsi, [rsi+rax*8] dec rcx - jnz next8_mbv + jnz .next8_mbv add rsp, 96 pop rsp @@ -1398,7 +1398,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_mmx): movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? mov rcx, 2 ; count -nexts8_h: +.nexts8_h: mov rdx, arg(2) ;blimit ; get blimit movq mm3, [rdx] ; @@ -1483,7 +1483,7 @@ nexts8_h: add rsi,8 neg rax dec rcx - jnz nexts8_h + jnz .nexts8_h ; begin epilog pop rdi @@ -1520,7 +1520,7 @@ sym(vp8_loop_filter_simple_vertical_edge_mmx): lea rsi, [rsi + rax*4- 2]; ; mov rcx, 2 ; count -nexts8_v: +.nexts8_v: lea rdi, [rsi + rax]; movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 @@ -1695,7 +1695,7 @@ nexts8_v: lea rsi, [rsi+rax*8] ; next 8 dec rcx - jnz nexts8_v + jnz .nexts8_v add rsp, 32 pop rsp diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm index 787e832687..81122181f2 100644 --- a/vp8/common/x86/postproc_mmx.asm +++ b/vp8/common/x86/postproc_mmx.asm @@ -58,10 +58,10 @@ sym(vp8_post_proc_down_and_across_mmx): movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? pxor mm0, mm0 ; mm0 = 00000000 -nextrow: +.nextrow: xor rdx, rdx ; clear out rdx for use as loop counter -nextcol: +.nextcol: pxor mm7, mm7 ; mm7 = 00000000 movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps @@ -146,7 +146,7 @@ nextcol: add rdx, 4 cmp edx, dword ptr arg(5) ;cols - jl nextcol + jl .nextcol ; done with the all cols, start the across filtering in place sub rsi, rdx sub rdi, rdx @@ -156,7 +156,7 @@ nextcol: xor rdx, rdx mov rax, [rdi-4]; -acrossnextcol: +.acrossnextcol: pxor mm7, mm7 ; mm7 = 00000000 movq mm6, [rbx + 32 ] ; movq mm4, [rdi+rdx] ; mm4 = p0..p7 @@ -237,7 +237,7 @@ acrossnextcol: add rdx, 4 cmp edx, dword ptr arg(5) ;cols - jl acrossnextcol; + jl .acrossnextcol; mov DWORD PTR [rdi+rdx-4], eax pop rax @@ -249,7 +249,7 @@ acrossnextcol: movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch? dec rcx ; decrement count - jnz nextrow ; next row + jnz .nextrow ; next row pop rbx ; begin epilog @@ -293,7 +293,7 @@ sym(vp8_mbpost_proc_down_mmx): add dword ptr arg(2), 8 ;for(c=0; c<cols; c+=4) -loop_col: +.loop_col: mov rsi, arg(0) ;s pxor mm0, mm0 ; @@ -312,7 +312,7 @@ loop_col: mov rcx, 15 ; -loop_initvar: +.loop_initvar: movd mm1, DWORD PTR [rdi]; punpcklbw mm1, mm0 ; @@ -329,10 +329,10 @@ loop_initvar: lea rdi, [rdi+rax] ; dec rcx - jne loop_initvar + jne .loop_initvar ;save the var and sum xor rdx, rdx -loop_row: +.loop_row: movd mm1, DWORD PTR [rsi] ; [s-pitch*8] movd mm2, DWORD PTR [rdi] ; [s+pitch*7] @@ -438,13 +438,13 @@ loop_row: add rdx, 1 cmp edx, dword arg(2) ;rows - jl loop_row + jl .loop_row add dword arg(0), 4 ; s += 4 sub dword arg(3), 4 ; cols -= 4 cmp dword arg(3), 0 - jg loop_col + jg .loop_col add rsp, 136 pop rsp @@ -475,7 +475,7 @@ sym(vp8_plane_add_noise_mmx): push rdi ; end prolog -addnoise_loop: +.addnoise_loop: call sym(rand) WRT_PLT mov rcx, arg(1) ;noise and rax, 0xff @@ -492,7 +492,7 @@ addnoise_loop: mov rsi, arg(0) ;Pos xor rax,rax -addnoise_nextset: +.addnoise_nextset: movq mm1,[rsi+rax] ; get the source psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise @@ -506,12 +506,12 @@ addnoise_nextset: add rax,8 ; move to the next line cmp rax, rcx - jl addnoise_nextset + jl .addnoise_nextset movsxd rax, dword arg(7) ; Pitch add arg(0), rax ; Start += Pitch sub dword arg(6), 1 ; Height -= 1 - jg addnoise_loop + jg .addnoise_loop ; begin epilog pop rdi diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm index 06d51ec6fe..1f219ca878 100644 --- a/vp8/common/x86/postproc_sse2.asm +++ b/vp8/common/x86/postproc_sse2.asm @@ -57,10 +57,10 @@ sym(vp8_post_proc_down_and_across_xmm): movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? pxor xmm0, xmm0 ; mm0 = 00000000 -nextrow: +.nextrow: xor rdx, rdx ; clear out rdx for use as loop counter -nextcol: +.nextcol: movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7 punpcklbw xmm3, xmm0 ; mm3 = p0..p3 movdqa xmm1, xmm3 ; mm1 = p0..p3 @@ -133,7 +133,7 @@ nextcol: add rdx, 8 cmp edx, dword arg(5) ;cols - jl nextcol + jl .nextcol ; done with the all cols, start the across filtering in place sub rsi, rdx @@ -142,7 +142,7 @@ nextcol: xor rdx, rdx movq mm0, QWORD PTR [rdi-8]; -acrossnextcol: +.acrossnextcol: movq xmm7, QWORD PTR [rdi +rdx -2] movd xmm4, DWORD PTR [rdi +rdx +6] @@ -219,7 +219,7 @@ acrossnextcol: add rdx, 8 cmp edx, dword arg(5) ;cols - jl acrossnextcol; + jl .acrossnextcol; ; last 8 pixels movq QWORD PTR [rdi+rdx-8], mm0 @@ -231,7 +231,7 @@ acrossnextcol: mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch? dec rcx ; decrement count - jnz nextrow ; next row + jnz .nextrow ; next row %if ABI_IS_32BIT=1 && CONFIG_PIC=1 add rsp,16 @@ -282,7 +282,7 @@ sym(vp8_mbpost_proc_down_xmm): add dword arg(2), 8 ;for(c=0; c<cols; c+=8) -loop_col: +.loop_col: mov rsi, arg(0) ; s pxor xmm0, xmm0 ; @@ -301,7 +301,7 @@ loop_col: mov rcx, 15 ; -loop_initvar: +.loop_initvar: movq xmm1, QWORD PTR [rdi]; punpcklbw xmm1, xmm0 ; @@ -318,10 +318,10 @@ loop_initvar: lea rdi, [rdi+rax] ; dec rcx - jne loop_initvar + jne .loop_initvar ;save the var and sum xor rdx, rdx -loop_row: +.loop_row: movq xmm1, QWORD PTR [rsi] ; [s-pitch*8] movq xmm2, QWORD PTR [rdi] ; [s+pitch*7] @@ -428,12 +428,12 @@ loop_row: add rdx, 1 cmp edx, dword arg(2) ;rows - jl loop_row + jl .loop_row add dword arg(0), 8 ; s += 8 sub dword arg(3), 8 ; cols -= 8 cmp dword arg(3), 0 - jg loop_col + jg .loop_col add rsp, 128+16 pop rsp @@ -475,13 +475,13 @@ sym(vp8_mbpost_proc_across_ip_xmm): ;for(r=0;r<rows;r++) -ip_row_loop: +.ip_row_loop: xor rdx, rdx ;sumsq=0; xor rcx, rcx ;sum=0; mov rsi, arg(0); s mov rdi, -8 -ip_var_loop: +.ip_var_loop: ;for(i=-8;i<=6;i++) ;{ ; sumsq += s[i]*s[i]; @@ -493,7 +493,7 @@ ip_var_loop: add edx, eax add rdi, 1 cmp rdi, 6 - jle ip_var_loop + jle .ip_var_loop ;mov rax, sumsq @@ -513,7 +513,7 @@ ip_var_loop: pxor mm1, mm1 pxor xmm0, xmm0 -nextcol4: +.nextcol4: movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5 movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10 @@ -600,7 +600,7 @@ nextcol4: add rcx, 4 cmp rcx, rdx - jl nextcol4 + jl .nextcol4 ;s+=pitch; movsxd rax, dword arg(1) @@ -608,7 +608,7 @@ nextcol4: sub dword arg(2), 1 ;rows-=1 cmp dword arg(2), 0 - jg ip_row_loop + jg .ip_row_loop add rsp, 16 pop rsp @@ -640,7 +640,7 @@ sym(vp8_plane_add_noise_wmt): push rdi ; end prolog -addnoise_loop: +.addnoise_loop: call sym(rand) WRT_PLT mov rcx, arg(1) ;noise and rax, 0xff @@ -657,7 +657,7 @@ addnoise_loop: mov rsi, arg(0) ;Pos xor rax,rax -addnoise_nextset: +.addnoise_nextset: movdqu xmm1,[rsi+rax] ; get the source psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise @@ -671,12 +671,12 @@ addnoise_nextset: add rax,16 ; move to the next line cmp rax, rcx - jl addnoise_nextset + jl .addnoise_nextset movsxd rax, dword arg(7) ; Pitch add arg(0), rax ; Start += Pitch sub dword arg(6), 1 ; Height -= 1 - jg addnoise_loop + jg .addnoise_loop ; begin epilog pop rdi diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm index 0e23116ce6..f54cc4e7e7 100644 --- a/vp8/common/x86/recon_sse2.asm +++ b/vp8/common/x86/recon_sse2.asm @@ -503,7 +503,7 @@ sym(vp8_intra_pred_uv_tm_%1): mov rdi, arg(0) ;dst; movsxd rcx, dword ptr arg(1) ;dst_stride -vp8_intra_pred_uv_tm_%1_loop: +.vp8_intra_pred_uv_tm_%1_loop: movd xmm3, [rsi] movd xmm5, [rsi+rax] %ifidn %1, sse2 @@ -525,7 +525,7 @@ vp8_intra_pred_uv_tm_%1_loop: lea rsi, [rsi+rax*2] lea rdi, [rdi+rcx*2] dec edx - jnz vp8_intra_pred_uv_tm_%1_loop + jnz .vp8_intra_pred_uv_tm_%1_loop ; begin epilog pop rdi @@ -615,7 +615,7 @@ sym(vp8_intra_pred_uv_ho_%1): %endif dec rsi %ifidn %1, mmx2 -vp8_intra_pred_uv_ho_%1_loop: +.vp8_intra_pred_uv_ho_%1_loop: movd mm0, [rsi] movd mm1, [rsi+rax] punpcklbw mm0, mm0 @@ -627,7 +627,7 @@ vp8_intra_pred_uv_ho_%1_loop: lea rsi, [rsi+rax*2] lea rdi, [rdi+rcx*2] dec edx - jnz vp8_intra_pred_uv_ho_%1_loop + jnz .vp8_intra_pred_uv_ho_%1_loop %else movd xmm0, [rsi] movd xmm3, [rsi+rax] diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm index 9004b525d9..e68d950ad5 100644 --- a/vp8/common/x86/subpixel_mmx.asm +++ b/vp8/common/x86/subpixel_mmx.asm @@ -50,7 +50,7 @@ sym(vp8_filter_block1d_h6_mmx): movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? pxor mm0, mm0 ; mm0 = 00000000 -nextrow: +.nextrow: movq mm3, [rsi-2] ; mm3 = p-2..p5 movq mm4, mm3 ; mm4 = p-2..p5 psrlq mm3, 8 ; mm3 = p-1..p5 @@ -102,7 +102,7 @@ nextrow: %endif dec rcx ; decrement count - jnz nextrow ; next row + jnz .nextrow ; next row ; begin epilog pop rdi @@ -152,7 +152,7 @@ sym(vp8_filter_block1dc_v6_mmx): pxor mm0, mm0 ; mm0 = 00000000 -nextrow_cv: +.nextrow_cv: movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. @@ -190,7 +190,7 @@ nextrow_cv: ; avoidable!!!. lea rdi, [rdi+rax] ; dec rcx ; decrement count - jnz nextrow_cv ; next row + jnz .nextrow_cv ; next row pop rbx @@ -282,7 +282,7 @@ sym(vp8_bilinear_predict8x8_mmx): packuswb mm7, mm4 ; add rsi, rdx ; next line -next_row_8x8: +.next_row_8x8: movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 movq mm4, mm3 ; make a copy of current line @@ -349,7 +349,7 @@ next_row_8x8: add rdi, r8 ;dst_pitch %endif cmp rdi, rcx ; - jne next_row_8x8 + jne .next_row_8x8 ; begin epilog pop rdi @@ -437,7 +437,7 @@ sym(vp8_bilinear_predict8x4_mmx): packuswb mm7, mm4 ; add rsi, rdx ; next line -next_row_8x4: +.next_row_8x4: movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 movq mm4, mm3 ; make a copy of current line @@ -504,7 +504,7 @@ next_row_8x4: add rdi, r8 %endif cmp rdi, rcx ; - jne next_row_8x4 + jne .next_row_8x4 ; begin epilog pop rdi @@ -579,7 +579,7 @@ sym(vp8_bilinear_predict4x4_mmx): packuswb mm7, mm0 ; add rsi, rdx ; next line -next_row_4x4: +.next_row_4x4: movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 @@ -622,7 +622,7 @@ next_row_4x4: %endif cmp rdi, rcx ; - jne next_row_4x4 + jne .next_row_4x4 ; begin epilog pop rdi diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm index 83e3b1479b..b62b5c68d1 100644 --- a/vp8/common/x86/subpixel_sse2.asm +++ b/vp8/common/x86/subpixel_sse2.asm @@ -55,7 +55,7 @@ sym(vp8_filter_block1d8_h6_sse2): %endif pxor xmm0, xmm0 ; clear xmm0 for unpack -filter_block1d8_h6_rowloop: +.filter_block1d8_h6_rowloop: movq xmm3, MMWORD PTR [rsi - 2] movq xmm1, MMWORD PTR [rsi + 6] @@ -124,7 +124,7 @@ filter_block1d8_h6_rowloop: %endif dec rcx - jnz filter_block1d8_h6_rowloop ; next row + jnz .filter_block1d8_h6_rowloop ; next row ; begin epilog pop rdi @@ -176,7 +176,7 @@ sym(vp8_filter_block1d16_h6_sse2): pxor xmm0, xmm0 ; clear xmm0 for unpack -filter_block1d16_h6_sse2_rowloop: +.filter_block1d16_h6_sse2_rowloop: movq xmm3, MMWORD PTR [rsi - 2] movq xmm1, MMWORD PTR [rsi + 6] @@ -301,7 +301,7 @@ filter_block1d16_h6_sse2_rowloop: %endif dec rcx - jnz filter_block1d16_h6_sse2_rowloop ; next row + jnz .filter_block1d16_h6_sse2_rowloop ; next row ; begin epilog pop rdi @@ -356,7 +356,7 @@ sym(vp8_filter_block1d8_v6_sse2): movsxd r8, dword ptr arg(2) ; dst_ptich %endif -vp8_filter_block1d8_v6_sse2_loop: +.vp8_filter_block1d8_v6_sse2_loop: movdqa xmm1, XMMWORD PTR [rsi] pmullw xmm1, [rax] @@ -396,7 +396,7 @@ vp8_filter_block1d8_v6_sse2_loop: add rdi, r8 %endif dec rcx ; decrement count - jnz vp8_filter_block1d8_v6_sse2_loop ; next row + jnz .vp8_filter_block1d8_v6_sse2_loop ; next row ; begin epilog pop rdi @@ -448,7 +448,7 @@ sym(vp8_filter_block1d16_v6_sse2): movsxd r8, dword ptr arg(2) ; dst_ptich %endif -vp8_filter_block1d16_v6_sse2_loop: +.vp8_filter_block1d16_v6_sse2_loop: ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] @@ -511,7 +511,7 @@ vp8_filter_block1d16_v6_sse2_loop: add rdi, r8 %endif dec rcx ; decrement count - jnz vp8_filter_block1d16_v6_sse2_loop ; next row + jnz .vp8_filter_block1d16_v6_sse2_loop ; next row ; begin epilog pop rdi @@ -556,7 +556,7 @@ sym(vp8_filter_block1d8_h6_only_sse2): %endif pxor xmm0, xmm0 ; clear xmm0 for unpack -filter_block1d8_h6_only_rowloop: +.filter_block1d8_h6_only_rowloop: movq xmm3, MMWORD PTR [rsi - 2] movq xmm1, MMWORD PTR [rsi + 6] @@ -624,7 +624,7 @@ filter_block1d8_h6_only_rowloop: %endif dec rcx - jnz filter_block1d8_h6_only_rowloop ; next row + jnz .filter_block1d8_h6_only_rowloop ; next row ; begin epilog pop rdi @@ -670,7 +670,7 @@ sym(vp8_filter_block1d16_h6_only_sse2): pxor xmm0, xmm0 ; clear xmm0 for unpack -filter_block1d16_h6_only_sse2_rowloop: +.filter_block1d16_h6_only_sse2_rowloop: movq xmm3, MMWORD PTR [rsi - 2] movq xmm1, MMWORD PTR [rsi + 6] @@ -789,7 +789,7 @@ filter_block1d16_h6_only_sse2_rowloop: %endif dec rcx - jnz filter_block1d16_h6_only_sse2_rowloop ; next row + jnz .filter_block1d16_h6_only_sse2_rowloop ; next row ; begin epilog pop rdi @@ -837,7 +837,7 @@ sym(vp8_filter_block1d8_v6_only_sse2): movsxd r8, dword ptr arg(3) ; dst_ptich %endif -vp8_filter_block1d8_v6_only_sse2_loop: +.vp8_filter_block1d8_v6_only_sse2_loop: movq xmm1, MMWORD PTR [rsi] movq xmm2, MMWORD PTR [rsi + rdx] movq xmm3, MMWORD PTR [rsi + rdx * 2] @@ -883,7 +883,7 @@ vp8_filter_block1d8_v6_only_sse2_loop: add rdi, r8 %endif dec rcx ; decrement count - jnz vp8_filter_block1d8_v6_only_sse2_loop ; next row + jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row ; begin epilog pop rdi @@ -924,7 +924,7 @@ sym(vp8_unpack_block1d16_h6_sse2): movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source %endif -unpack_block1d16_h6_sse2_rowloop: +.unpack_block1d16_h6_sse2_rowloop: movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 @@ -941,7 +941,7 @@ unpack_block1d16_h6_sse2_rowloop: add rdi, r8 %endif dec rcx - jnz unpack_block1d16_h6_sse2_rowloop ; next row + jnz .unpack_block1d16_h6_sse2_rowloop ; next row ; begin epilog pop rdi @@ -980,7 +980,7 @@ sym(vp8_bilinear_predict16x16_sse2): movsxd rax, dword ptr arg(2) ;xoffset cmp rax, 0 ;skip first_pass filter if xoffset=0 - je b16x16_sp_only + je .b16x16_sp_only shl rax, 5 add rax, rcx ;HFilter @@ -995,7 +995,7 @@ sym(vp8_bilinear_predict16x16_sse2): movsxd rax, dword ptr arg(3) ;yoffset cmp rax, 0 ;skip second_pass filter if yoffset=0 - je b16x16_fp_only + je .b16x16_fp_only shl rax, 5 add rax, rcx ;VFilter @@ -1041,7 +1041,7 @@ sym(vp8_bilinear_predict16x16_sse2): packuswb xmm7, xmm4 add rsi, rdx ; next line -next_row: +.next_row: movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 movdqa xmm4, xmm3 ; make a copy of current line @@ -1104,11 +1104,11 @@ next_row: %endif cmp rdi, rcx - jne next_row + jne .next_row - jmp done + jmp .done -b16x16_sp_only: +.b16x16_sp_only: movsxd rax, dword ptr arg(3) ;yoffset shl rax, 5 add rax, rcx ;VFilter @@ -1130,7 +1130,7 @@ b16x16_sp_only: movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 add rsi, rax ; next line -next_row_spo: +.next_row_spo: movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 movdqa xmm5, xmm7 @@ -1164,17 +1164,17 @@ next_row_spo: add rsi, rax ; next line add rdi, rdx ;dst_pitch cmp rdi, rcx - jne next_row_spo + jne .next_row_spo - jmp done + jmp .done -b16x16_fp_only: +.b16x16_fp_only: lea rcx, [rdi+rdx*8] lea rcx, [rcx+rdx*8] movsxd rax, dword ptr arg(1) ;src_pixels_per_line pxor xmm0, xmm0 -next_row_fpo: +.next_row_fpo: movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 movdqa xmm4, xmm3 ; make a copy of current line @@ -1208,9 +1208,9 @@ next_row_fpo: add rsi, rax ; next line add rdi, rdx ; dst_pitch cmp rdi, rcx - jne next_row_fpo + jne .next_row_fpo -done: +.done: ; begin epilog pop rdi pop rsi @@ -1318,7 +1318,7 @@ sym(vp8_bilinear_predict8x8_sse2): movdqa xmm7, xmm3 add rsp, 16 ; next line -next_row8x8: +.next_row8x8: movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 movdqa xmm4, xmm3 ; make a copy of current line psrldq xmm4, 1 @@ -1352,7 +1352,7 @@ next_row8x8: add rdi, rdx cmp rdi, rcx - jne next_row8x8 + jne .next_row8x8 ;add rsp, 144 pop rsp diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm index 1ddbc54bd6..6bca82bfb3 100644 --- a/vp8/common/x86/subpixel_ssse3.asm +++ b/vp8/common/x86/subpixel_ssse3.asm @@ -70,7 +70,7 @@ sym(vp8_filter_block1d8_h6_ssse3): sub rdi, rdx ;xmm3 free -filter_block1d8_h6_rowloop_ssse3: +.filter_block1d8_h6_rowloop_ssse3: movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 @@ -102,7 +102,7 @@ filter_block1d8_h6_rowloop_ssse3: packuswb xmm0, xmm0 movq MMWORD Ptr [rdi], xmm0 - jnz filter_block1d8_h6_rowloop_ssse3 + jnz .filter_block1d8_h6_rowloop_ssse3 ; begin epilog pop rdi @@ -129,7 +129,7 @@ vp8_filter_block1d8_h4_ssse3: sub rdi, rdx -filter_block1d8_h4_rowloop_ssse3: +.filter_block1d8_h4_rowloop_ssse3: movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 @@ -158,7 +158,7 @@ filter_block1d8_h4_rowloop_ssse3: movq MMWORD Ptr [rdi], xmm0 - jnz filter_block1d8_h4_rowloop_ssse3 + jnz .filter_block1d8_h4_rowloop_ssse3 ; begin epilog pop rdi @@ -207,7 +207,7 @@ sym(vp8_filter_block1d16_h6_ssse3): movsxd rcx, dword ptr arg(4) ;output_height movsxd rdx, dword ptr arg(3) ;output_pitch -filter_block1d16_h6_rowloop_ssse3: +.filter_block1d16_h6_rowloop_ssse3: movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 @@ -264,7 +264,7 @@ filter_block1d16_h6_rowloop_ssse3: lea rdi, [rdi + rdx] dec rcx - jnz filter_block1d16_h6_rowloop_ssse3 + jnz .filter_block1d16_h6_rowloop_ssse3 ; begin epilog pop rdi @@ -304,7 +304,7 @@ sym(vp8_filter_block1d4_h6_ssse3): movdqa xmm7, [GLOBAL(rd)] cmp esi, DWORD PTR [rax] - je vp8_filter_block1d4_h4_ssse3 + je .vp8_filter_block1d4_h4_ssse3 movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 @@ -318,7 +318,7 @@ sym(vp8_filter_block1d4_h6_ssse3): movsxd rdx, dword ptr arg(3) ;output_pitch ;xmm3 free -filter_block1d4_h6_rowloop_ssse3: +.filter_block1d4_h6_rowloop_ssse3: movdqu xmm0, XMMWORD PTR [rsi - 2] movdqa xmm1, xmm0 @@ -346,7 +346,7 @@ filter_block1d4_h6_rowloop_ssse3: add rdi, rdx dec rcx - jnz filter_block1d4_h6_rowloop_ssse3 + jnz .filter_block1d4_h6_rowloop_ssse3 ; begin epilog pop rdi @@ -356,7 +356,7 @@ filter_block1d4_h6_rowloop_ssse3: pop rbp ret -vp8_filter_block1d4_h4_ssse3: +.vp8_filter_block1d4_h4_ssse3: movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] @@ -369,7 +369,7 @@ vp8_filter_block1d4_h4_ssse3: movsxd rdx, dword ptr arg(3) ;output_pitch -filter_block1d4_h4_rowloop_ssse3: +.filter_block1d4_h4_rowloop_ssse3: movdqu xmm1, XMMWORD PTR [rsi - 2] movdqa xmm2, xmm1 @@ -391,7 +391,7 @@ filter_block1d4_h4_rowloop_ssse3: add rdi, rdx dec rcx - jnz filter_block1d4_h4_rowloop_ssse3 + jnz .filter_block1d4_h4_rowloop_ssse3 ; begin epilog pop rdi @@ -432,7 +432,7 @@ sym(vp8_filter_block1d16_v6_ssse3): add rax, rdx cmp esi, DWORD PTR [rax] - je vp8_filter_block1d16_v4_ssse3 + je .vp8_filter_block1d16_v4_ssse3 movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 @@ -450,7 +450,7 @@ sym(vp8_filter_block1d16_v6_ssse3): add rax, rdx -vp8_filter_block1d16_v6_ssse3_loop: +.vp8_filter_block1d16_v6_ssse3_loop: movq xmm1, MMWORD PTR [rsi] ;A movq xmm2, MMWORD PTR [rsi + rdx] ;B movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C @@ -508,7 +508,7 @@ vp8_filter_block1d16_v6_ssse3_loop: add rdi, r8 %endif dec rcx - jnz vp8_filter_block1d16_v6_ssse3_loop + jnz .vp8_filter_block1d16_v6_ssse3_loop ; begin epilog pop rdi @@ -519,7 +519,7 @@ vp8_filter_block1d16_v6_ssse3_loop: pop rbp ret -vp8_filter_block1d16_v4_ssse3: +.vp8_filter_block1d16_v4_ssse3: movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 @@ -534,7 +534,7 @@ vp8_filter_block1d16_v4_ssse3: movsxd rcx, DWORD PTR arg(4) ;output_height add rax, rdx -vp8_filter_block1d16_v4_ssse3_loop: +.vp8_filter_block1d16_v4_ssse3_loop: movq xmm2, MMWORD PTR [rsi + rdx] ;B movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C movq xmm4, MMWORD PTR [rax + rdx * 2] ;D @@ -581,7 +581,7 @@ vp8_filter_block1d16_v4_ssse3_loop: add rdi, r8 %endif dec rcx - jnz vp8_filter_block1d16_v4_ssse3_loop + jnz .vp8_filter_block1d16_v4_ssse3_loop ; begin epilog pop rdi @@ -627,7 +627,7 @@ sym(vp8_filter_block1d8_v6_ssse3): movsxd rcx, DWORD PTR arg(4) ;[output_height] cmp esi, DWORD PTR [rax] - je vp8_filter_block1d8_v4_ssse3 + je .vp8_filter_block1d8_v4_ssse3 movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 @@ -638,7 +638,7 @@ sym(vp8_filter_block1d8_v6_ssse3): mov rax, rsi add rax, rdx -vp8_filter_block1d8_v6_ssse3_loop: +.vp8_filter_block1d8_v6_ssse3_loop: movq xmm1, MMWORD PTR [rsi] ;A movq xmm2, MMWORD PTR [rsi + rdx] ;B movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C @@ -673,7 +673,7 @@ vp8_filter_block1d8_v6_ssse3_loop: add rdi, r8 %endif dec rcx - jnz vp8_filter_block1d8_v6_ssse3_loop + jnz .vp8_filter_block1d8_v6_ssse3_loop ; begin epilog pop rdi @@ -684,7 +684,7 @@ vp8_filter_block1d8_v6_ssse3_loop: pop rbp ret -vp8_filter_block1d8_v4_ssse3: +.vp8_filter_block1d8_v4_ssse3: movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 movdqa xmm5, [GLOBAL(rd)] @@ -694,7 +694,7 @@ vp8_filter_block1d8_v4_ssse3: mov rax, rsi add rax, rdx -vp8_filter_block1d8_v4_ssse3_loop: +.vp8_filter_block1d8_v4_ssse3_loop: movq xmm2, MMWORD PTR [rsi + rdx] ;B movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C movq xmm4, MMWORD PTR [rax + rdx * 2] ;D @@ -722,7 +722,7 @@ vp8_filter_block1d8_v4_ssse3_loop: add rdi, r8 %endif dec rcx - jnz vp8_filter_block1d8_v4_ssse3_loop + jnz .vp8_filter_block1d8_v4_ssse3_loop ; begin epilog pop rdi @@ -766,7 +766,7 @@ sym(vp8_filter_block1d4_v6_ssse3): movsxd rcx, DWORD PTR arg(4) ;[output_height] cmp esi, DWORD PTR [rax] - je vp8_filter_block1d4_v4_ssse3 + je .vp8_filter_block1d4_v4_ssse3 movq mm5, MMWORD PTR [rax] ;k0_k5 movq mm6, MMWORD PTR [rax+256] ;k2_k4 @@ -777,7 +777,7 @@ sym(vp8_filter_block1d4_v6_ssse3): mov rax, rsi add rax, rdx -vp8_filter_block1d4_v6_ssse3_loop: +.vp8_filter_block1d4_v6_ssse3_loop: movd mm1, DWORD PTR [rsi] ;A movd mm2, DWORD PTR [rsi + rdx] ;B movd mm3, DWORD PTR [rsi + rdx * 2] ;C @@ -813,7 +813,7 @@ vp8_filter_block1d4_v6_ssse3_loop: add rdi, r8 %endif dec rcx - jnz vp8_filter_block1d4_v6_ssse3_loop + jnz .vp8_filter_block1d4_v6_ssse3_loop ; begin epilog pop rdi @@ -823,7 +823,7 @@ vp8_filter_block1d4_v6_ssse3_loop: pop rbp ret -vp8_filter_block1d4_v4_ssse3: +.vp8_filter_block1d4_v4_ssse3: movq mm6, MMWORD PTR [rax+256] ;k2_k4 movq mm7, MMWORD PTR [rax+128] ;k1_k3 movq mm5, MMWORD PTR [GLOBAL(rd)] @@ -833,7 +833,7 @@ vp8_filter_block1d4_v4_ssse3: mov rax, rsi add rax, rdx -vp8_filter_block1d4_v4_ssse3_loop: +.vp8_filter_block1d4_v4_ssse3_loop: movd mm2, DWORD PTR [rsi + rdx] ;B movd mm3, DWORD PTR [rsi + rdx * 2] ;C movd mm4, DWORD PTR [rax + rdx * 2] ;D @@ -861,7 +861,7 @@ vp8_filter_block1d4_v4_ssse3_loop: add rdi, r8 %endif dec rcx - jnz vp8_filter_block1d4_v4_ssse3_loop + jnz .vp8_filter_block1d4_v4_ssse3_loop ; begin epilog pop rdi @@ -895,7 +895,7 @@ sym(vp8_bilinear_predict16x16_ssse3): movsxd rax, dword ptr arg(2) ; xoffset cmp rax, 0 ; skip first_pass filter if xoffset=0 - je b16x16_sp_only + je .b16x16_sp_only shl rax, 4 lea rax, [rax + rcx] ; HFilter @@ -909,7 +909,7 @@ sym(vp8_bilinear_predict16x16_ssse3): movsxd rax, dword ptr arg(3) ; yoffset cmp rax, 0 ; skip second_pass filter if yoffset=0 - je b16x16_fp_only + je .b16x16_fp_only shl rax, 4 lea rax, [rax + rcx] ; VFilter @@ -996,9 +996,9 @@ sym(vp8_bilinear_predict16x16_ssse3): cmp rdi, rcx jne .next_row - jmp done + jmp .done -b16x16_sp_only: +.b16x16_sp_only: movsxd rax, dword ptr arg(3) ; yoffset shl rax, 4 lea rax, [rax + rcx] ; VFilter @@ -1018,7 +1018,7 @@ b16x16_sp_only: movq xmm2, [rsi + 8] ; load row 0 lea rsi, [rsi + rax] ; next line -.next_row: +.next_row_sp: movq xmm3, [rsi] ; load row + 1 movq xmm5, [rsi + 8] ; load row + 1 @@ -1062,16 +1062,16 @@ b16x16_sp_only: lea rdi, [rdi + 2*rdx] cmp rdi, rcx - jne .next_row + jne .next_row_sp - jmp done + jmp .done -b16x16_fp_only: +.b16x16_fp_only: lea rcx, [rdi+rdx*8] lea rcx, [rcx+rdx*8] movsxd rax, dword ptr arg(1) ; src_pixels_per_line -.next_row: +.next_row_fp: movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 @@ -1122,9 +1122,9 @@ b16x16_fp_only: cmp rdi, rcx - jne .next_row + jne .next_row_fp -done: +.done: ; begin epilog pop rdi pop rsi @@ -1191,7 +1191,7 @@ sym(vp8_bilinear_predict8x8_ssse3): movsxd rax, dword ptr arg(2) ; xoffset cmp rax, 0 ; skip first_pass filter if xoffset=0 - je b8x8_sp_only + je .b8x8_sp_only shl rax, 4 add rax, rcx ; HFilter @@ -1203,7 +1203,7 @@ sym(vp8_bilinear_predict8x8_ssse3): movsxd rax, dword ptr arg(3) ; yoffset cmp rax, 0 ; skip second_pass filter if yoffset=0 - je b8x8_fp_only + je .b8x8_fp_only shl rax, 4 lea rax, [rax + rcx] ; VFilter @@ -1260,9 +1260,9 @@ sym(vp8_bilinear_predict8x8_ssse3): cmp rdi, rcx jne .next_row - jmp done8x8 + jmp .done8x8 -b8x8_sp_only: +.b8x8_sp_only: movsxd rax, dword ptr arg(3) ; yoffset shl rax, 4 lea rax, [rax + rcx] ; VFilter @@ -1364,12 +1364,12 @@ b8x8_sp_only: movq [rdi+rdx], xmm1 lea rsp, [rsp + 144] - jmp done8x8 + jmp .done8x8 -b8x8_fp_only: +.b8x8_fp_only: lea rcx, [rdi+rdx*8] -.next_row: +.next_row_fp: movdqa xmm1, XMMWORD PTR [rsp] movdqa xmm3, XMMWORD PTR [rsp+16] @@ -1430,11 +1430,11 @@ b8x8_fp_only: lea rdi, [rdi + 2*rdx] cmp rdi, rcx - jne .next_row + jne .next_row_fp lea rsp, [rsp + 16] -done8x8: +.done8x8: ;add rsp, 144 pop rsp ; begin epilog diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm index 9946294995..7ec7d603c9 100644 --- a/vp8/encoder/x86/encodeopt.asm +++ b/vp8/encoder/x86/encodeopt.asm @@ -148,7 +148,7 @@ sym(vp8_mbblock_error_mmx_impl): pcmpeqw mm1, mm7 mov rcx, 16 -mberror_loop_mmx: +.mberror_loop_mmx: movq mm3, [rsi] movq mm4, [rdi] @@ -186,7 +186,7 @@ mberror_loop_mmx: add rdi, 32 sub rcx, 1 - jnz mberror_loop_mmx + jnz .mberror_loop_mmx movq mm0, mm2 psrlq mm2, 32 @@ -226,7 +226,7 @@ sym(vp8_mbblock_error_xmm_impl): pcmpeqw xmm5, xmm6 mov rcx, 16 -mberror_loop: +.mberror_loop: movdqa xmm0, [rsi] movdqa xmm1, [rdi] @@ -249,7 +249,7 @@ mberror_loop: paddd xmm4, xmm2 paddd xmm4, xmm0 - jnz mberror_loop + jnz .mberror_loop movdqa xmm0, xmm4 punpckldq xmm0, xmm6 @@ -289,7 +289,7 @@ sym(vp8_mbuverror_mmx_impl): mov rcx, 16 pxor mm7, mm7 -mbuverror_loop_mmx: +.mbuverror_loop_mmx: movq mm1, [rsi] movq mm2, [rdi] @@ -313,7 +313,7 @@ mbuverror_loop_mmx: add rdi, 16 dec rcx - jnz mbuverror_loop_mmx + jnz .mbuverror_loop_mmx movq mm0, mm7 psrlq mm7, 32 @@ -346,7 +346,7 @@ sym(vp8_mbuverror_xmm_impl): mov rcx, 16 pxor xmm3, xmm3 -mbuverror_loop: +.mbuverror_loop: movdqa xmm1, [rsi] movdqa xmm2, [rdi] @@ -360,7 +360,7 @@ mbuverror_loop: add rdi, 16 dec rcx - jnz mbuverror_loop + jnz .mbuverror_loop pxor xmm0, xmm0 movdqa xmm1, xmm3 diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index 056b64c390..c483933df1 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -137,17 +137,17 @@ sym(vp8_regular_quantize_b_sse2): ; if (x >= zbin) sub cx, WORD PTR[rdx] ; x - zbin lea rdx, [rdx + 2] ; zbin_boost_ptr++ - jl rq_zigzag_loop_%1 ; x < zbin + jl .rq_zigzag_loop_%1 ; x < zbin movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2] ; downshift by quant_shift[rc] movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc] sar edi, cl ; also sets Z bit - je rq_zigzag_loop_%1 ; !y + je .rq_zigzag_loop_%1 ; !y mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost -rq_zigzag_loop_%1: +.rq_zigzag_loop_%1: %endmacro ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c ZIGZAG_LOOP 0 diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm index 258899eedd..95e1c20744 100644 --- a/vp8/encoder/x86/quantize_sse4.asm +++ b/vp8/encoder/x86/quantize_sse4.asm @@ -140,21 +140,21 @@ sym(vp8_regular_quantize_b_sse4): ; if (x >= zbin) sub cx, WORD PTR[rdx] ; x - zbin lea rdx, [rdx + 2] ; zbin_boost_ptr++ - jl rq_zigzag_loop_%1 ; x < zbin + jl .rq_zigzag_loop_%1 ; x < zbin pextrw edi, %3, %2 ; y ; downshift by quant_shift[rc] pextrb ecx, xmm5, %1 ; quant_shift[rc] sar edi, cl ; also sets Z bit - je rq_zigzag_loop_%1 ; !y + je .rq_zigzag_loop_%1 ; !y %if ABI_IS_32BIT mov WORD PTR[rsp + qcoeff + %1 *2], di %else pinsrw %5, edi, %2 ; qcoeff[rc] %endif mov rdx, rax ; reset to b->zrun_zbin_boost -rq_zigzag_loop_%1: +.rq_zigzag_loop_%1: %endmacro ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4 diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm index 85cb023a48..407b399790 100644 --- a/vp8/encoder/x86/sad_mmx.asm +++ b/vp8/encoder/x86/sad_mmx.asm @@ -43,7 +43,7 @@ sym(vp8_sad16x16_mmx): pxor mm6, mm6 -x16x16sad_mmx_loop: +.x16x16sad_mmx_loop: movq mm0, QWORD PTR [rsi] movq mm2, QWORD PTR [rsi+8] @@ -83,7 +83,7 @@ x16x16sad_mmx_loop: paddw mm7, mm1 cmp rsi, rcx - jne x16x16sad_mmx_loop + jne .x16x16sad_mmx_loop movq mm0, mm7 @@ -135,7 +135,7 @@ sym(vp8_sad8x16_mmx): pxor mm6, mm6 -x8x16sad_mmx_loop: +.x8x16sad_mmx_loop: movq mm0, QWORD PTR [rsi] movq mm1, QWORD PTR [rdi] @@ -158,7 +158,7 @@ x8x16sad_mmx_loop: paddw mm7, mm2 cmp rsi, rcx - jne x8x16sad_mmx_loop + jne .x8x16sad_mmx_loop movq mm0, mm7 punpcklwd mm0, mm6 @@ -205,7 +205,7 @@ sym(vp8_sad8x8_mmx): pxor mm6, mm6 -x8x8sad_mmx_loop: +.x8x8sad_mmx_loop: movq mm0, QWORD PTR [rsi] movq mm1, QWORD PTR [rdi] @@ -228,7 +228,7 @@ x8x8sad_mmx_loop: paddw mm7, mm0 cmp rsi, rcx - jne x8x8sad_mmx_loop + jne .x8x8sad_mmx_loop movq mm0, mm7 punpcklwd mm0, mm6 @@ -364,7 +364,7 @@ sym(vp8_sad16x8_mmx): pxor mm6, mm6 -x16x8sad_mmx_loop: +.x16x8sad_mmx_loop: movq mm0, [rsi] movq mm1, [rdi] @@ -404,7 +404,7 @@ x16x8sad_mmx_loop: paddw mm7, mm0 cmp rsi, rcx - jne x16x8sad_mmx_loop + jne .x16x8sad_mmx_loop movq mm0, mm7 punpcklwd mm0, mm6 diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm index 1011c95538..fa8e3e3f8c 100644 --- a/vp8/encoder/x86/sad_sse2.asm +++ b/vp8/encoder/x86/sad_sse2.asm @@ -37,7 +37,7 @@ sym(vp8_sad16x16_wmt): lea rcx, [rcx+rax*8] pxor xmm6, xmm6 -x16x16sad_wmt_loop: +.x16x16sad_wmt_loop: movq xmm0, QWORD PTR [rsi] movq xmm2, QWORD PTR [rsi+8] @@ -68,7 +68,7 @@ x16x16sad_wmt_loop: paddw xmm6, xmm4 cmp rsi, rcx - jne x16x16sad_wmt_loop + jne .x16x16sad_wmt_loop movq xmm0, xmm6 psrldq xmm6, 8 @@ -111,11 +111,11 @@ sym(vp8_sad8x16_wmt): lea rcx, [rcx+rbx*8] pxor mm7, mm7 -x8x16sad_wmt_loop: +.x8x16sad_wmt_loop: movq rax, mm7 cmp eax, arg(4) - jg x8x16sad_wmt_early_exit + jg .x8x16sad_wmt_early_exit movq mm0, QWORD PTR [rsi] movq mm1, QWORD PTR [rdi] @@ -133,11 +133,11 @@ x8x16sad_wmt_loop: paddw mm7, mm2 cmp rsi, rcx - jne x8x16sad_wmt_loop + jne .x8x16sad_wmt_loop movq rax, mm7 -x8x16sad_wmt_early_exit: +.x8x16sad_wmt_early_exit: ; begin epilog pop rdi @@ -172,11 +172,11 @@ sym(vp8_sad8x8_wmt): lea rcx, [rsi+rbx*8] pxor mm7, mm7 -x8x8sad_wmt_loop: +.x8x8sad_wmt_loop: movq rax, mm7 cmp eax, arg(4) - jg x8x8sad_wmt_early_exit + jg .x8x8sad_wmt_early_exit movq mm0, QWORD PTR [rsi] movq mm1, QWORD PTR [rdi] @@ -188,10 +188,10 @@ x8x8sad_wmt_loop: paddw mm7, mm0 cmp rsi, rcx - jne x8x8sad_wmt_loop + jne .x8x8sad_wmt_loop movq rax, mm7 -x8x8sad_wmt_early_exit: +.x8x8sad_wmt_early_exit: ; begin epilog pop rdi @@ -281,11 +281,11 @@ sym(vp8_sad16x8_wmt): lea rcx, [rsi+rbx*8] pxor mm7, mm7 -x16x8sad_wmt_loop: +.x16x8sad_wmt_loop: movq rax, mm7 cmp eax, arg(4) - jg x16x8sad_wmt_early_exit + jg .x16x8sad_wmt_early_exit movq mm0, QWORD PTR [rsi] movq mm2, QWORD PTR [rsi+8] @@ -315,11 +315,11 @@ x16x8sad_wmt_loop: paddw mm7, mm4 cmp rsi, rcx - jne x16x8sad_wmt_loop + jne .x16x8sad_wmt_loop movq rax, mm7 -x16x8sad_wmt_early_exit: +.x16x8sad_wmt_early_exit: ; begin epilog pop rdi @@ -352,7 +352,7 @@ sym(vp8_copy32xn_sse2): movsxd rdx, dword ptr arg(3) ;dst_stride movsxd rcx, dword ptr arg(4) ;height -block_copy_sse2_loopx4: +.block_copy_sse2_loopx4: movdqu xmm0, XMMWORD PTR [rsi] movdqu xmm1, XMMWORD PTR [rsi + 16] movdqu xmm2, XMMWORD PTR [rsi + rax] @@ -383,12 +383,12 @@ block_copy_sse2_loopx4: sub rcx, 4 cmp rcx, 4 - jge block_copy_sse2_loopx4 + jge .block_copy_sse2_loopx4 cmp rcx, 0 - je copy_is_done + je .copy_is_done -block_copy_sse2_loop: +.block_copy_sse2_loop: movdqu xmm0, XMMWORD PTR [rsi] movdqu xmm1, XMMWORD PTR [rsi + 16] lea rsi, [rsi+rax] @@ -398,9 +398,9 @@ block_copy_sse2_loop: lea rdi, [rdi+rdx] sub rcx, 1 - jne block_copy_sse2_loop + jne .block_copy_sse2_loop -copy_is_done: +.copy_is_done: ; begin epilog pop rdi pop rsi diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm index 9e05521664..a2550974cf 100644 --- a/vp8/encoder/x86/sad_sse3.asm +++ b/vp8/encoder/x86/sad_sse3.asm @@ -647,7 +647,7 @@ sym(vp8_copy32xn_sse3): STACK_FRAME_CREATE_X3 -block_copy_sse3_loopx4: +.block_copy_sse3_loopx4: lea end_ptr, [src_ptr+src_stride*2] movdqu xmm0, XMMWORD PTR [src_ptr] @@ -676,13 +676,13 @@ block_copy_sse3_loopx4: sub height, 4 cmp height, 4 - jge block_copy_sse3_loopx4 + jge .block_copy_sse3_loopx4 ;Check to see if there is more rows need to be copied. cmp height, 0 - je copy_is_done + je .copy_is_done -block_copy_sse3_loop: +.block_copy_sse3_loop: movdqu xmm0, XMMWORD PTR [src_ptr] movdqu xmm1, XMMWORD PTR [src_ptr + 16] lea src_ptr, [src_ptr+src_stride] @@ -692,9 +692,9 @@ block_copy_sse3_loop: lea ref_ptr, [ref_ptr+ref_stride] sub height, 1 - jne block_copy_sse3_loop + jne .block_copy_sse3_loop -copy_is_done: +.copy_is_done: STACK_FRAME_DESTROY_X3 ;void vp8_sad16x16x4d_sse3( diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm index 6ecf081843..95b6c89e64 100644 --- a/vp8/encoder/x86/sad_ssse3.asm +++ b/vp8/encoder/x86/sad_ssse3.asm @@ -169,30 +169,30 @@ sym(vp8_sad16x16x3_ssse3): mov rdx, 0xf and rdx, rdi - jmp vp8_sad16x16x3_ssse3_skiptable -vp8_sad16x16x3_ssse3_jumptable: - dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump - dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump -vp8_sad16x16x3_ssse3_skiptable: - - call vp8_sad16x16x3_ssse3_do_jump -vp8_sad16x16x3_ssse3_do_jump: + jmp .vp8_sad16x16x3_ssse3_skiptable +.vp8_sad16x16x3_ssse3_jumptable: + dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump + dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump +.vp8_sad16x16x3_ssse3_skiptable: + + call .vp8_sad16x16x3_ssse3_do_jump +.vp8_sad16x16x3_ssse3_do_jump: pop rcx ; get the address of do_jump - mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump + mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable @@ -203,23 +203,23 @@ vp8_sad16x16x3_ssse3_do_jump: jmp rcx - PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3 - -vp8_sad16x16x3_ssse3_aligned_by_15: + PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3 + +.vp8_sad16x16x3_ssse3_aligned_by_15: PROCESS_16X2X3 1 PROCESS_16X2X3 0 PROCESS_16X2X3 0 @@ -229,7 +229,7 @@ vp8_sad16x16x3_ssse3_aligned_by_15: PROCESS_16X2X3 0 PROCESS_16X2X3 0 -vp8_sad16x16x3_ssse3_store_off: +.vp8_sad16x16x3_ssse3_store_off: mov rdi, arg(4) ;Results movq xmm0, xmm5 @@ -282,30 +282,30 @@ sym(vp8_sad16x8x3_ssse3): mov rdx, 0xf and rdx, rdi - jmp vp8_sad16x8x3_ssse3_skiptable -vp8_sad16x8x3_ssse3_jumptable: - dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump - dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump -vp8_sad16x8x3_ssse3_skiptable: - - call vp8_sad16x8x3_ssse3_do_jump -vp8_sad16x8x3_ssse3_do_jump: + jmp .vp8_sad16x8x3_ssse3_skiptable +.vp8_sad16x8x3_ssse3_jumptable: + dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump + dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump +.vp8_sad16x8x3_ssse3_skiptable: + + call .vp8_sad16x8x3_ssse3_do_jump +.vp8_sad16x8x3_ssse3_do_jump: pop rcx ; get the address of do_jump - mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump + mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable @@ -316,30 +316,30 @@ vp8_sad16x8x3_ssse3_do_jump: jmp rcx - PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3 - -vp8_sad16x8x3_ssse3_aligned_by_15: + PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3 + +.vp8_sad16x8x3_ssse3_aligned_by_15: PROCESS_16X2X3 1 PROCESS_16X2X3 0 PROCESS_16X2X3 0 PROCESS_16X2X3 0 -vp8_sad16x8x3_ssse3_store_off: +.vp8_sad16x8x3_ssse3_store_off: mov rdi, arg(4) ;Results movq xmm0, xmm5 diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm index 8af4b4533f..c6db3d1c62 100644 --- a/vp8/encoder/x86/ssim_opt.asm +++ b/vp8/encoder/x86/ssim_opt.asm @@ -84,7 +84,7 @@ sym(vp8_ssim_parms_16x16_sse2): pxor xmm11,xmm11 ;sum_sxr mov rdx, 16 ;row counter -NextRow: +.NextRow: ;grab source and reference pixels movdqu xmm5, [rsi] @@ -107,7 +107,7 @@ NextRow: add rdi, rax ; next r row dec rdx ; counter - jnz NextRow + jnz .NextRow SUM_ACROSS_W xmm15 SUM_ACROSS_W xmm14 @@ -174,7 +174,7 @@ sym(vp8_ssim_parms_8x8_sse2): pxor xmm11,xmm11 ;sum_sxr mov rdx, 8 ;row counter -NextRow2: +.NextRow: ;grab source and reference pixels movq xmm3, [rsi] @@ -188,7 +188,7 @@ NextRow2: add rdi, rax ; next r row dec rdx ; counter - jnz NextRow2 + jnz .NextRow SUM_ACROSS_W xmm15 SUM_ACROSS_W xmm14 diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm index a47e1f0d6e..4ce16ce900 100644 --- a/vp8/encoder/x86/subtract_mmx.asm +++ b/vp8/encoder/x86/subtract_mmx.asm @@ -93,7 +93,7 @@ sym(vp8_subtract_mby_mmx): mov rcx, 16 pxor mm0, mm0 -submby_loop: +.submby_loop: movq mm1, [rsi] movq mm3, [rax] @@ -139,7 +139,7 @@ submby_loop: lea rsi, [rsi+rdx] sub rcx, 1 - jnz submby_loop + jnz .submby_loop pop rdi pop rsi diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm index 95888f6be9..3bd1ff6784 100644 --- a/vp8/encoder/x86/subtract_sse2.asm +++ b/vp8/encoder/x86/subtract_sse2.asm @@ -91,7 +91,7 @@ sym(vp8_subtract_mby_sse2): mov rcx, 8 ; do two lines at one time -submby_loop: +.submby_loop: movdqa xmm0, XMMWORD PTR [rsi] ; src movdqa xmm1, XMMWORD PTR [rax] ; pred @@ -133,7 +133,7 @@ submby_loop: lea rsi, [rsi+rdx*2] sub rcx, 1 - jnz submby_loop + jnz .submby_loop pop rdi pop rsi diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm index b777ef5666..b97c694391 100644 --- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm +++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -71,26 +71,26 @@ sym(vp8_temporal_filter_apply_sse2): lea rcx, [rdx + 16*16*1] cmp dword ptr [rsp + block_size], 8 - jne temporal_filter_apply_load_16 + jne .temporal_filter_apply_load_16 lea rcx, [rdx + 8*8*1] -temporal_filter_apply_load_8: +.temporal_filter_apply_load_8: movq xmm0, [rsi] ; first row lea rsi, [rsi + rbp] ; += stride punpcklbw xmm0, xmm7 ; src[ 0- 7] movq xmm1, [rsi] ; second row lea rsi, [rsi + rbp] ; += stride punpcklbw xmm1, xmm7 ; src[ 8-15] - jmp temporal_filter_apply_load_finished + jmp .temporal_filter_apply_load_finished -temporal_filter_apply_load_16: +.temporal_filter_apply_load_16: movdqa xmm0, [rsi] ; src (frame1) lea rsi, [rsi + rbp] ; += stride movdqa xmm1, xmm0 punpcklbw xmm0, xmm7 ; src[ 0- 7] punpckhbw xmm1, xmm7 ; src[ 8-15] -temporal_filter_apply_load_finished: +.temporal_filter_apply_load_finished: movdqa xmm2, [rdx] ; predictor (frame2) movdqa xmm3, xmm2 punpcklbw xmm2, xmm7 ; pred[ 0- 7] @@ -176,13 +176,13 @@ temporal_filter_apply_load_finished: lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) cmp rdx, rcx - je temporal_filter_apply_epilog + je .temporal_filter_apply_epilog pxor xmm7, xmm7 ; zero for extraction cmp dword ptr [rsp + block_size], 16 - je temporal_filter_apply_load_16 - jmp temporal_filter_apply_load_8 + je .temporal_filter_apply_load_16 + jmp .temporal_filter_apply_load_8 -temporal_filter_apply_epilog: +.temporal_filter_apply_epilog: ; begin epilog mov rbp, [rsp + rbp_backup] add rsp, stack_size diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm index 13b76ea91f..2be8bbeb3d 100644 --- a/vp8/encoder/x86/variance_impl_mmx.asm +++ b/vp8/encoder/x86/variance_impl_mmx.asm @@ -27,7 +27,7 @@ sym(vp8_get_mb_ss_mmx): mov rcx, 16 pxor mm4, mm4 -NEXTROW: +.NEXTROW: movq mm0, [rax] movq mm1, [rax+8] movq mm2, [rax+16] @@ -44,7 +44,7 @@ NEXTROW: add rax, 32 dec rcx - ja NEXTROW + ja .NEXTROW movq QWORD PTR [rsp], mm4 ;return sum[0]+sum[1]; @@ -568,7 +568,7 @@ sym(vp8_filter_block2d_bil4x4_var_mmx): add rsi, r8 %endif -filter_block2d_bil4x4_var_mmx_loop: +.filter_block2d_bil4x4_var_mmx_loop: movd mm1, [rsi] ; movd mm3, [rsi+1] ; @@ -614,7 +614,7 @@ filter_block2d_bil4x4_var_mmx_loop: add rdi, r9 %endif sub rcx, 1 ; - jnz filter_block2d_bil4x4_var_mmx_loop ; + jnz .filter_block2d_bil4x4_var_mmx_loop ; pxor mm3, mm3 ; @@ -726,7 +726,7 @@ sym(vp8_filter_block2d_bil_var_mmx): add rsi, r8 %endif -filter_block2d_bil_var_mmx_loop: +.filter_block2d_bil_var_mmx_loop: movq mm1, [rsi] ; movq mm3, [rsi+1] ; @@ -807,7 +807,7 @@ filter_block2d_bil_var_mmx_loop: add rdi, r9 %endif sub rcx, 1 ; - jnz filter_block2d_bil_var_mmx_loop ; + jnz .filter_block2d_bil_var_mmx_loop ; pxor mm3, mm3 ; diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index b7a6b3286c..762922091e 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -33,7 +33,7 @@ sym(vp8_get_mb_ss_sse2): mov rcx, 8 pxor xmm4, xmm4 -NEXTROW: +.NEXTROW: movdqa xmm0, [rax] movdqa xmm1, [rax+16] movdqa xmm2, [rax+32] @@ -50,7 +50,7 @@ NEXTROW: add rax, 0x40 dec rcx - ja NEXTROW + ja .NEXTROW movdqa xmm3,xmm4 psrldq xmm4,8 @@ -126,7 +126,7 @@ sym(vp8_get16x16var_sse2): pxor xmm6, xmm6 ; clear xmm6 for accumulating sse mov rcx, 16 -var16loop: +.var16loop: movdqu xmm1, XMMWORD PTR [rsi] movdqu xmm2, XMMWORD PTR [rdi] @@ -160,7 +160,7 @@ var16loop: add rdi, rdx sub rcx, 1 - jnz var16loop + jnz .var16loop movdqa xmm1, xmm6 diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm index a582f8dc5f..97e8b0e2e1 100644 --- a/vp8/encoder/x86/variance_impl_ssse3.asm +++ b/vp8/encoder/x86/variance_impl_ssse3.asm @@ -47,7 +47,7 @@ sym(vp8_filter_block2d_bil_var_ssse3): movsxd rax, dword ptr arg(5) ; xoffset cmp rax, 0 ; skip first_pass filter if xoffset=0 - je filter_block2d_bil_var_ssse3_sp_only + je .filter_block2d_bil_var_ssse3_sp_only shl rax, 4 ; point to filter coeff with xoffset lea rax, [rax + rcx] ; HFilter @@ -55,7 +55,7 @@ sym(vp8_filter_block2d_bil_var_ssse3): movsxd rdx, dword ptr arg(6) ; yoffset cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je filter_block2d_bil_var_ssse3_fp_only + je .filter_block2d_bil_var_ssse3_fp_only shl rdx, 4 lea rdx, [rdx + rcx] ; VFilter @@ -88,7 +88,7 @@ sym(vp8_filter_block2d_bil_var_ssse3): lea rsi, [rsi + r8] %endif -filter_block2d_bil_var_ssse3_loop: +.filter_block2d_bil_var_ssse3_loop: movdqu xmm1, XMMWORD PTR [rsi] movdqu xmm2, XMMWORD PTR [rsi+1] movdqa xmm3, xmm1 @@ -142,15 +142,15 @@ filter_block2d_bil_var_ssse3_loop: %endif sub rcx, 1 - jnz filter_block2d_bil_var_ssse3_loop + jnz .filter_block2d_bil_var_ssse3_loop - jmp filter_block2d_bil_variance + jmp .filter_block2d_bil_variance -filter_block2d_bil_var_ssse3_sp_only: +.filter_block2d_bil_var_ssse3_sp_only: movsxd rdx, dword ptr arg(6) ; yoffset cmp rdx, 0 ; Both xoffset =0 and yoffset=0 - je filter_block2d_bil_var_ssse3_full_pixel + je .filter_block2d_bil_var_ssse3_full_pixel shl rdx, 4 lea rdx, [rdx + rcx] ; VFilter @@ -169,7 +169,7 @@ filter_block2d_bil_var_ssse3_sp_only: lea rsi, [rsi + rax] -filter_block2d_bil_sp_only_loop: +.filter_block2d_bil_sp_only_loop: movdqu xmm3, XMMWORD PTR [rsi] movdqa xmm2, xmm1 movdqa xmm0, xmm3 @@ -209,11 +209,11 @@ filter_block2d_bil_sp_only_loop: %endif sub rcx, 1 - jnz filter_block2d_bil_sp_only_loop + jnz .filter_block2d_bil_sp_only_loop - jmp filter_block2d_bil_variance + jmp .filter_block2d_bil_variance -filter_block2d_bil_var_ssse3_full_pixel: +.filter_block2d_bil_var_ssse3_full_pixel: mov rsi, arg(0) ;ref_ptr mov rdi, arg(2) ;src_ptr movsxd rcx, dword ptr arg(4) ;Height @@ -221,7 +221,7 @@ filter_block2d_bil_var_ssse3_full_pixel: movsxd rdx, dword ptr arg(3) ;src_pixels_per_line pxor xmm0, xmm0 -filter_block2d_bil_full_pixel_loop: +.filter_block2d_bil_full_pixel_loop: movq xmm1, QWORD PTR [rsi] punpcklbw xmm1, xmm0 movq xmm2, QWORD PTR [rsi+8] @@ -244,11 +244,11 @@ filter_block2d_bil_full_pixel_loop: lea rsi, [rsi + rax] ;ref_pixels_per_line lea rdi, [rdi + rdx] ;src_pixels_per_line sub rcx, 1 - jnz filter_block2d_bil_full_pixel_loop + jnz .filter_block2d_bil_full_pixel_loop - jmp filter_block2d_bil_variance + jmp .filter_block2d_bil_variance -filter_block2d_bil_var_ssse3_fp_only: +.filter_block2d_bil_var_ssse3_fp_only: mov rsi, arg(0) ;ref_ptr mov rdi, arg(2) ;src_ptr movsxd rcx, dword ptr arg(4) ;Height @@ -260,7 +260,7 @@ filter_block2d_bil_var_ssse3_fp_only: movsxd r9, dword ptr arg(3) ;src_pixels_per_line %endif -filter_block2d_bil_fp_only_loop: +.filter_block2d_bil_fp_only_loop: movdqu xmm1, XMMWORD PTR [rsi] movdqu xmm2, XMMWORD PTR [rsi+1] movdqa xmm3, xmm1 @@ -298,11 +298,11 @@ filter_block2d_bil_fp_only_loop: %endif sub rcx, 1 - jnz filter_block2d_bil_fp_only_loop + jnz .filter_block2d_bil_fp_only_loop - jmp filter_block2d_bil_variance + jmp .filter_block2d_bil_variance -filter_block2d_bil_variance: +.filter_block2d_bil_variance: pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm5, xmm5 -- GitLab