Commit 2497356d authored by Johann

Update x86inc.asm from x264

Revision d23d18655249944c1ca894b451e2c82c7a584c62
from https://git.videolan.org/x264.git

Change-Id: I841ec8c5ea71935aa364657299c6bba0c8742fc1
parent fdc977af
URL: http://git.videolan.org/?p=x264.git URL: https://git.videolan.org/git/x264.git
Version: a95584945dd9ce3acc66c6cd8f6796bc4404d40d Version: d23d18655249944c1ca894b451e2c82c7a584c62
License: ISC License: ISC
License File: LICENSE License File: LICENSE
...@@ -8,17 +8,3 @@ x264/libav's framework for x86 assembly. Contains a variety of macros and ...@@ -8,17 +8,3 @@ x264/libav's framework for x86 assembly. Contains a variety of macros and
defines that help automatically allow assembly to work cross-platform. defines that help automatically allow assembly to work cross-platform.
Local Modifications: Local Modifications:
Get configuration from vpx_config.asm.
Prefix functions with vpx by default.
Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
exist in libvpx.
Expand PIC default to macho64 and respect CONFIG_PIC from libvpx
Catch all elf formats for 'hidden' status and SECTION notes.
Avoid 'amdnop' when building with nasm.
Set 'private_extern' visibility for macho targets.
Copy PIC 'GLOBAL' macros from x86_abi_support.asm
Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
Use .text with no alignment for aout
Only use 'hidden' visibility with Chromium
Move '%use smartalign' for nasm out of 'INIT_CPUFLAGS' and before
'ALIGNMODE'.
;***************************************************************************** ;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer ;* x86inc.asm: x264asm abstraction layer
;***************************************************************************** ;*****************************************************************************
;* Copyright (C) 2005-2015 x264 project ;* Copyright (C) 2005-2016 x264 project
;* ;*
;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Anton Mitrofanov <BugMaster@narod.ru> ;* Anton Mitrofanov <BugMaster@narod.ru>
...@@ -34,10 +34,8 @@ ...@@ -34,10 +34,8 @@
; as this feature might be useful for others as well. Send patches or ideas ; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org . ; to x264-devel@videolan.org .
%include "vpx_config.asm"
%ifndef private_prefix %ifndef private_prefix
%define private_prefix vpx %define private_prefix x264
%endif %endif
%ifndef public_prefix %ifndef public_prefix
...@@ -66,129 +64,40 @@ ...@@ -66,129 +64,40 @@
%endif %endif
%endif %endif
%ifidn __OUTPUT_FORMAT__,elf32 %define FORMAT_ELF 0
%define mangle(x) x %ifidn __OUTPUT_FORMAT__,elf
%define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
%define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64 %elifidn __OUTPUT_FORMAT__,elf64
%define mangle(x) x %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,x64
%define mangle(x) x
%elifidn __OUTPUT_FORMAT__,win64
%define mangle(x) x
%else
%define mangle(x) _ %+ x
%endif %endif
; In some instances macho32 tables get misaligned when using .rodata. %ifdef PREFIX
; When looking at the disassembly it appears that the offset is either %define mangle(x) _ %+ x
; correct or consistently off by 90. Placing them in the .text section
; works around the issue. It appears to be specific to the way libvpx
; handles the tables.
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,macho32
SECTION .text align=%1
fakegot:
%elifidn __OUTPUT_FORMAT__,aout
SECTION .text
%else
SECTION .rodata align=%1
%endif
%endmacro
%macro SECTION_TEXT 0-1 16
%ifidn __OUTPUT_FORMAT__,aout
SECTION .text
%else
SECTION .text align=%1
%endif
%endmacro
; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
; from original code is added in for 64bit.
%ifidn __OUTPUT_FORMAT__,elf32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,macho32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,win32
%define ABI_IS_32BIT 1
%elifidn __OUTPUT_FORMAT__,aout
%define ABI_IS_32BIT 1
%else %else
%define ABI_IS_32BIT 0 %define mangle(x) x
%endif %endif
%if ABI_IS_32BIT %macro SECTION_RODATA 0-1 16
%if CONFIG_PIC=1 SECTION .rodata align=%1
%ifidn __OUTPUT_FORMAT__,elf32 %endmacro
%define GET_GOT_DEFINED 1
%define WRT_PLT wrt ..plt
%macro GET_GOT 1
extern _GLOBAL_OFFSET_TABLE_
push %1
call %%get_got
%%sub_offset:
jmp %%exitGG
%%get_got:
mov %1, [esp]
add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
ret
%%exitGG:
%undef GLOBAL
%define GLOBAL(x) x + %1 wrt ..gotoff
%undef RESTORE_GOT
%define RESTORE_GOT pop %1
%endmacro
%elifidn __OUTPUT_FORMAT__,macho32
%define GET_GOT_DEFINED 1
%macro GET_GOT 1
push %1
call %%get_got
%%get_got:
pop %1
%undef GLOBAL
%define GLOBAL(x) x + %1 - %%get_got
%undef RESTORE_GOT
%define RESTORE_GOT pop %1
%endmacro
%else
%define GET_GOT_DEFINED 0
%endif
%endif
%if ARCH_X86_64 == 0
%undef PIC
%endif
%else
%macro GET_GOT 1
%endmacro
%define GLOBAL(x) rel x
%define WRT_PLT wrt ..plt
%if WIN64
%define PIC
%elifidn __OUTPUT_FORMAT__,macho64
%define PIC
%elif CONFIG_PIC
%define PIC
%endif
%endif
%ifnmacro GET_GOT %if WIN64
%macro GET_GOT 1 %define PIC
%endmacro %elif ARCH_X86_64 == 0
%define GLOBAL(x) x ; x86_32 doesn't require PIC.
%endif ; Some distros prefer shared objects to be PIC, but nothing breaks if
%ifndef RESTORE_GOT ; the code contains a few textrels, so we'll skip that complexity.
%define RESTORE_GOT %undef PIC
%endif %endif
%ifndef WRT_PLT
%define WRT_PLT
%endif
%ifdef PIC %ifdef PIC
default rel default rel
%endif %endif
; Done with PIC macros
%ifdef __NASM_VER__
%use smartalign
%endif
; Macros to eliminate most code duplication between x86_32 and x86_64: ; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments ; Currently this works only for leaf functions which load all their arguments
...@@ -237,6 +146,7 @@ ...@@ -237,6 +146,7 @@
%define r%1w %2w %define r%1w %2w
%define r%1b %2b %define r%1b %2b
%define r%1h %2h %define r%1h %2h
%define %2q %2
%if %0 == 2 %if %0 == 2
%define r%1m %2d %define r%1m %2d
%define r%1mp %2 %define r%1mp %2
...@@ -261,9 +171,9 @@ ...@@ -261,9 +171,9 @@
%define e%1h %3 %define e%1h %3
%define r%1b %2 %define r%1b %2
%define e%1b %2 %define e%1b %2
%if ARCH_X86_64 == 0 %if ARCH_X86_64 == 0
%define r%1 e%1 %define r%1 e%1
%endif %endif
%endmacro %endmacro
DECLARE_REG_SIZE ax, al, ah DECLARE_REG_SIZE ax, al, ah
...@@ -373,7 +283,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 ...@@ -373,7 +283,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%macro ASSERT 1 %macro ASSERT 1
%if (%1) == 0 %if (%1) == 0
%error assert failed %error assertion ``%1'' failed
%endif %endif
%endmacro %endmacro
...@@ -464,8 +374,10 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 ...@@ -464,8 +374,10 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
%if %1 > 0 %if %1 > 0
%assign regs_used (regs_used + 1) %assign regs_used (regs_used + 1)
%elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 %endif
%warning "Stack pointer will overwrite register argument" %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
; Ensure that we don't clobber any registers containing arguments
%assign regs_used 5 + UNIX64 * 3
%endif %endif
%endif %endif
%endif %endif
...@@ -579,9 +491,9 @@ DECLARE_REG 14, R15, 120 ...@@ -579,9 +491,9 @@ DECLARE_REG 14, R15, 120
%macro RET 0 %macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp WIN64_RESTORE_XMM_INTERNAL rsp
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32 %if mmsize == 32
vzeroupper vzeroupper
%endif %endif
AUTO_REP_RET AUTO_REP_RET
%endmacro %endmacro
...@@ -618,17 +530,17 @@ DECLARE_REG 14, R15, 72 ...@@ -618,17 +530,17 @@ DECLARE_REG 14, R15, 72
%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 %define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
%macro RET 0 %macro RET 0
%if stack_size_padded > 0 %if stack_size_padded > 0
%if required_stack_alignment > STACK_ALIGNMENT %if required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm mov rsp, rstkm
%else %else
add rsp, stack_size_padded add rsp, stack_size_padded
%endif %endif
%endif %endif
POP_IF_USED 14, 13, 12, 11, 10, 9 POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32 %if mmsize == 32
vzeroupper vzeroupper
%endif %endif
AUTO_REP_RET AUTO_REP_RET
%endmacro %endmacro
...@@ -674,29 +586,29 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 ...@@ -674,29 +586,29 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 %define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
%macro RET 0 %macro RET 0
%if stack_size_padded > 0 %if stack_size_padded > 0
%if required_stack_alignment > STACK_ALIGNMENT %if required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm mov rsp, rstkm
%else %else
add rsp, stack_size_padded add rsp, stack_size_padded
%endif %endif
%endif %endif
POP_IF_USED 6, 5, 4, 3 POP_IF_USED 6, 5, 4, 3
%if mmsize == 32 %if mmsize == 32
vzeroupper vzeroupper
%endif %endif
AUTO_REP_RET AUTO_REP_RET
%endmacro %endmacro
%endif ;====================================================================== %endif ;======================================================================
%if WIN64 == 0 %if WIN64 == 0
%macro WIN64_SPILL_XMM 1 %macro WIN64_SPILL_XMM 1
%endmacro %endmacro
%macro WIN64_RESTORE_XMM 1 %macro WIN64_RESTORE_XMM 1
%endmacro %endmacro
%macro WIN64_PUSH_XMM 0 %macro WIN64_PUSH_XMM 0
%endmacro %endmacro
%endif %endif
; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
...@@ -709,24 +621,26 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 ...@@ -709,24 +621,26 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%else %else
rep ret rep ret
%endif %endif
annotate_function_size
%endmacro %endmacro
%define last_branch_adr $$ %define last_branch_adr $$
%macro AUTO_REP_RET 0 %macro AUTO_REP_RET 0
%ifndef cpuflags %if notcpuflag(ssse3)
times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
%elif notcpuflag(ssse3)
times ((last_branch_adr-$)>>31)+1 rep
%endif %endif
ret ret
annotate_function_size
%endmacro %endmacro
%macro BRANCH_INSTR 0-* %macro BRANCH_INSTR 0-*
%rep %0 %rep %0
%macro %1 1-2 %1 %macro %1 1-2 %1
%2 %1 %2 %1
%%branch_instr: %if notcpuflag(ssse3)
%xdefine last_branch_adr %%branch_instr %%branch_instr equ $
%xdefine last_branch_adr %%branch_instr
%endif
%endmacro %endmacro
%rotate 1 %rotate 1
%endrep %endrep
...@@ -741,6 +655,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, ...@@ -741,6 +655,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%elif %2 %elif %2
jmp %1 jmp %1
%endif %endif
annotate_function_size
%endmacro %endmacro
;============================================================================= ;=============================================================================
...@@ -762,16 +677,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, ...@@ -762,16 +677,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
cglobal_internal 0, %1 %+ SUFFIX, %2 cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro %endmacro
%macro cglobal_internal 2-3+ %macro cglobal_internal 2-3+
annotate_function_size
%if %1 %if %1
%xdefine %%FUNCTION_PREFIX private_prefix %xdefine %%FUNCTION_PREFIX private_prefix
; libvpx explicitly sets visibility in shared object builds. Avoid %xdefine %%VISIBILITY hidden
; setting visibility to hidden as it may break builds that split
; sources on e.g., directory boundaries.
%ifdef CHROMIUM
%xdefine %%VISIBILITY hidden
%else
%xdefine %%VISIBILITY
%endif
%else %else
%xdefine %%FUNCTION_PREFIX public_prefix %xdefine %%FUNCTION_PREFIX public_prefix
%xdefine %%VISIBILITY %xdefine %%VISIBILITY
...@@ -782,22 +691,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, ...@@ -782,22 +691,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
CAT_XDEFINE cglobaled_, %2, 1 CAT_XDEFINE cglobaled_, %2, 1
%endif %endif
%xdefine current_function %2 %xdefine current_function %2
%ifidn __OUTPUT_FORMAT__,elf32 %xdefine current_function_section __SECT__
global %2:function %%VISIBILITY %if FORMAT_ELF
%elifidn __OUTPUT_FORMAT__,elf64
global %2:function %%VISIBILITY global %2:function %%VISIBILITY
%elifidn __OUTPUT_FORMAT__,macho32
%ifdef __NASM_VER__
global %2
%else
global %2:private_extern
%endif
%elifidn __OUTPUT_FORMAT__,macho64
%ifdef __NASM_VER__
global %2
%else
global %2:private_extern
%endif
%else %else
global %2 global %2
%endif %endif
...@@ -822,16 +718,16 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, ...@@ -822,16 +718,16 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
; like cextern, but without the prefix ; like cextern, but without the prefix
%macro cextern_naked 1 %macro cextern_naked 1
%xdefine %1 mangle(%1) %ifdef PREFIX
%xdefine %1 mangle(%1)
%endif
CAT_XDEFINE cglobaled_, %1, 1 CAT_XDEFINE cglobaled_, %1, 1
extern %1 extern %1
%endmacro %endmacro
%macro const 1-2+ %macro const 1-2+
%xdefine %1 mangle(private_prefix %+ _ %+ %1) %xdefine %1 mangle(private_prefix %+ _ %+ %1)
%ifidn __OUTPUT_FORMAT__,elf32 %if FORMAT_ELF
global %1:data hidden
%elifidn __OUTPUT_FORMAT__,elf64
global %1:data hidden global %1:data hidden
%else %else
global %1 global %1
...@@ -839,14 +735,29 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, ...@@ -839,14 +735,29 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%1: %2 %1: %2
%endmacro %endmacro
; This is needed for ELF, otherwise the GNU linker assumes the stack is ; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
; executable by default. %if FORMAT_ELF
%ifidn __OUTPUT_FORMAT__,elf32 [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf64
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif %endif
; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
%macro annotate_function_size 0
%ifdef __YASM_VER__
%ifdef current_function
%if FORMAT_ELF
current_function_section
%%ecf equ $
size current_function %%ecf - current_function
__SECT__
%endif
%endif
%endif
%endmacro
; cpuflags ; cpuflags
%assign cpuflags_mmx (1<<0) %assign cpuflags_mmx (1<<0)
...@@ -875,12 +786,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ...@@ -875,12 +786,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt %assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 %assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) %define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)
%ifdef __NASM_VER__
%use smartalign
%endif
; Takes an arbitrary number of cpuflags from the above list. ; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu. ; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
...@@ -917,12 +825,18 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ...@@ -917,12 +825,18 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif %endif
%endif %endif
%ifdef __NASM_VER__ %if ARCH_X86_64 || cpuflag(sse2)
ALIGNMODE k7 %ifdef __NASM_VER__
%elif ARCH_X86_64 || cpuflag(sse2) ALIGNMODE k8
CPU amdnop %else
CPU amdnop
%endif
%else %else
CPU basicnop %ifdef __NASM_VER__
ALIGNMODE nop
%else
CPU basicnop
%endif
%endif %endif
%endmacro %endmacro
...@@ -951,14 +865,14 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ...@@ -951,14 +865,14 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define movnta movntq %define movnta movntq
%assign %%i 0 %assign %%i 0
%rep 8 %rep 8
CAT_XDEFINE m, %%i, mm %+ %%i CAT_XDEFINE m, %%i, mm %+ %%i
CAT_XDEFINE nnmm, %%i, %%i CAT_XDEFINE nnmm, %%i, %%i
%assign %%i %%i+1 %assign %%i %%i+1
%endrep %endrep
%rep 8 %rep 8
CAT_UNDEF m, %%i CAT_UNDEF m, %%i
CAT_UNDEF nnmm, %%i CAT_UNDEF nnmm, %%i
%assign %%i %%i+1 %assign %%i %%i+1
%endrep %endrep
INIT_CPUFLAGS %1 INIT_CPUFLAGS %1
%endmacro %endmacro
...@@ -969,7 +883,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ...@@ -969,7 +883,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define mmsize 16 %define mmsize 16
%define num_mmregs 8 %define num_mmregs 8
%if ARCH_X86_64 %if ARCH_X86_64
%define num_mmregs 16 %define num_mmregs 16
%endif %endif
%define mova movdqa %define mova movdqa
%define movu movdqu %define movu movdqu
...@@ -977,9 +891,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ...@@ -977,9 +891,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define movnta movntdq %define movnta movntdq
%assign %%i 0 %assign %%i 0
%rep num_mmregs %rep num_mmregs
CAT_XDEFINE m, %%i, xmm %+ %%i CAT_XDEFINE m, %%i, xmm %+ %%i
CAT_XDEFINE nnxmm, %%i, %%i CAT_XDEFINE nnxmm, %%i, %%i
%assign %%i %%i+1 %assign %%i %%i+1
%endrep %endrep
INIT_CPUFLAGS %1 INIT_CPUFLAGS %1
%endmacro %endmacro
...@@ -990,7 +904,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ...@@ -990,7 +904,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define mmsize 32