Commit dfa42cf8 authored by Johann's avatar Johann

Update x86inc.asm from x264

Revision a95584945dd9ce3acc66c6cd8f6796bc4404d40d
from git://git.videolan.org/x264.git

Temporarily name file x86inc.asm until all necessary local patches are
applied.

Change-Id: I9c7d0ed4d3ed900ae2d5db0abbcc048a2892c9b8
parent a9aa29d9
URL: http://git.videolan.org/?p=x264.git
Version: 999b753ff0f4dc872077f4fa90d465e948cbe656
Version: a95584945dd9ce3acc66c6cd8f6796bc4404d40d
License: ISC
License File: LICENSE
......@@ -8,5 +8,3 @@ x264/libav's framework for x86 assembly. Contains a variety of macros and
defines that help automatically allow assembly to work cross-platform.
Local Modifications:
Some modifications to allow PIC to work with x86inc.
Conditionally define program_name to allow overriding.
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2015 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Anton Mitrofanov <BugMaster@narod.ru>
;* Fiona Glaser <fiona@x264.com>
;* Henrik Gramner <henrik@gramner.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************
; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.
; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .
%ifndef private_prefix
%define private_prefix x264
%endif
%ifndef public_prefix
%define public_prefix private_prefix
%endif
%ifndef STACK_ALIGNMENT
%if ARCH_X86_64
%define STACK_ALIGNMENT 16
%else
%define STACK_ALIGNMENT 4
%endif
%endif
%define WIN64 0
%define UNIX64 0
%if ARCH_X86_64
%ifidn __OUTPUT_FORMAT__,win32
%define WIN64 1
%elifidn __OUTPUT_FORMAT__,win64
%define WIN64 1
%elifidn __OUTPUT_FORMAT__,x64
%define WIN64 1
%else
%define UNIX64 1
%endif
%endif
%ifdef PREFIX
%define mangle(x) _ %+ x
%else
%define mangle(x) x
%endif
%macro SECTION_RODATA 0-1 16
SECTION .rodata align=%1
%endmacro
%macro SECTION_TEXT 0-1 16
SECTION .text align=%1
%endmacro
%if WIN64
%define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
%undef PIC
%endif
%ifdef PIC
default rel
%endif
; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.
; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
; allocating the specified stack size. If the required stack alignment is
; larger than the known stack alignment the stack will be manually aligned
; and an extra register will be allocated to hold the original stack
; pointer (to not invalidate r0m etc.). To prevent the use of an extra
; register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.
; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need more flexible macro.
; RET:
; Pops anything that was pushed by PROLOGUE, and returns.
; REP_RET:
; Use this instead of RET if it's a branch target.
; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
%macro DECLARE_REG 2-3
%define r%1q %2
%define r%1d %2d
%define r%1w %2w
%define r%1b %2b
%define r%1h %2h
%if %0 == 2
%define r%1m %2d
%define r%1mp %2
%elif ARCH_X86_64 ; memory
%define r%1m [rstk + stack_offset + %3]
%define r%1mp qword r %+ %1 %+ m
%else
%define r%1m [rstk + stack_offset + %3]
%define r%1mp dword r %+ %1 %+ m
%endif
%define r%1 %2
%endmacro
%macro DECLARE_REG_SIZE 3
%define r%1q r%1
%define e%1q r%1
%define r%1d e%1
%define e%1d e%1
%define r%1w %1
%define e%1w %1
%define r%1h %3
%define e%1h %3
%define r%1b %2
%define e%1b %2
%if ARCH_X86_64 == 0
%define r%1 e%1
%endif
%endmacro
DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null
; t# defines for when per-arch register allocation is more complex than just function arguments
%macro DECLARE_REG_TMP 1-*
%assign %%i 0
%rep %0
CAT_XDEFINE t, %%i, r%1
%assign %%i %%i+1
%rotate 1
%endrep
%endmacro
%macro DECLARE_REG_TMP_SIZE 0-*
%rep %0
%define t%1q t%1 %+ q
%define t%1d t%1 %+ d
%define t%1w t%1 %+ w
%define t%1h t%1 %+ h
%define t%1b t%1 %+ b
%rotate 1
%endrep
%endmacro
DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%if ARCH_X86_64
%define gprsize 8
%else
%define gprsize 4
%endif
%macro PUSH 1
push %1
%ifidn rstk, rsp
%assign stack_offset stack_offset+gprsize
%endif
%endmacro
%macro POP 1
pop %1
%ifidn rstk, rsp
%assign stack_offset stack_offset-gprsize
%endif
%endmacro
%macro PUSH_IF_USED 1-*
%rep %0
%if %1 < regs_used
PUSH r%1
%endif
%rotate 1
%endrep
%endmacro
%macro POP_IF_USED 1-*
%rep %0
%if %1 < regs_used
pop r%1
%endif
%rotate 1
%endrep
%endmacro
%macro LOAD_IF_USED 1-*
%rep %0
%if %1 < num_args
mov r%1, r %+ %1 %+ mp
%endif
%rotate 1
%endrep
%endmacro
%macro SUB 2
sub %1, %2
%ifidn %1, rstk
%assign stack_offset stack_offset+(%2)
%endif
%endmacro
%macro ADD 2
add %1, %2
%ifidn %1, rstk
%assign stack_offset stack_offset-(%2)
%endif
%endmacro
%macro movifnidn 2
%ifnidn %1, %2
mov %1, %2
%endif
%endmacro
%macro movsxdifnidn 2
%ifnidn %1, %2
movsxd %1, %2
%endif
%endmacro
%macro ASSERT 1
%if (%1) == 0
%error assert failed
%endif
%endmacro
%macro DEFINE_ARGS 0-*
%ifdef n_arg_names
%assign %%i 0
%rep n_arg_names
CAT_UNDEF arg_name %+ %%i, q
CAT_UNDEF arg_name %+ %%i, d
CAT_UNDEF arg_name %+ %%i, w
CAT_UNDEF arg_name %+ %%i, h
CAT_UNDEF arg_name %+ %%i, b
CAT_UNDEF arg_name %+ %%i, m
CAT_UNDEF arg_name %+ %%i, mp
CAT_UNDEF arg_name, %%i
%assign %%i %%i+1
%endrep
%endif
%xdefine %%stack_offset stack_offset
%undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
%assign %%i 0
%rep %0
%xdefine %1q r %+ %%i %+ q
%xdefine %1d r %+ %%i %+ d
%xdefine %1w r %+ %%i %+ w
%xdefine %1h r %+ %%i %+ h
%xdefine %1b r %+ %%i %+ b
%xdefine %1m r %+ %%i %+ m
%xdefine %1mp r %+ %%i %+ mp
CAT_XDEFINE arg_name, %%i, %1
%assign %%i %%i+1
%rotate 1
%endrep
%xdefine stack_offset %%stack_offset
%assign n_arg_names %0
%endmacro
%define required_stack_alignment ((mmsize + 15) & ~15)
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
%if %1 != 0
%assign %%pad 0
%assign stack_size %1
%if stack_size < 0
%assign stack_size -stack_size
%endif
%if WIN64
%assign %%pad %%pad + 32 ; shadow space
%if mmsize != 8
%assign xmm_regs_used %2
%if xmm_regs_used > 8
%assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
%endif
%endif
%endif
%if required_stack_alignment <= STACK_ALIGNMENT
; maintain the current stack alignment
%assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
SUB rsp, stack_size_padded
%else
%assign %%reg_num (regs_used - 1)
%xdefine rstk r %+ %%reg_num
; align stack, and save original stack location directly above
; it, i.e. in [rsp+stack_size_padded], so we can restore the
; stack in a single instruction (i.e. mov rsp, rstk or mov
; rsp, [rsp+stack_size_padded])
%if %1 < 0 ; need to store rsp on stack
%xdefine rstkm [rsp + stack_size + %%pad]
%assign %%pad %%pad + gprsize
%else ; can keep rsp in rstk during whole function
%xdefine rstkm rstk
%endif
%assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
mov rstk, rsp
and rsp, ~(required_stack_alignment-1)
sub rsp, stack_size_padded
movifnidn rstkm, rstk
%endif
WIN64_PUSH_XMM
%endif
%endif
%endmacro
%macro SETUP_STACK_POINTER 1
%ifnum %1
%if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
%if %1 > 0
%assign regs_used (regs_used + 1)
%elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
%warning "Stack pointer will overwrite register argument"
%endif
%endif
%endif
%endmacro
%macro DEFINE_ARGS_INTERNAL 3+
%ifnum %2
DEFINE_ARGS %3
%elif %1 == 4
DEFINE_ARGS %2
%elif %1 > 4
DEFINE_ARGS %2, %3
%endif
%endmacro
%if WIN64 ; Windows x64 ;=================================================
DECLARE_REG 0, rcx
DECLARE_REG 1, rdx
DECLARE_REG 2, R8
DECLARE_REG 3, R9
DECLARE_REG 4, R10, 40
DECLARE_REG 5, R11, 48
DECLARE_REG 6, rax, 56
DECLARE_REG 7, rdi, 64
DECLARE_REG 8, rsi, 72
DECLARE_REG 9, rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120
%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
ALLOC_STACK %4, %3
%if mmsize != 8 && stack_size == 0
WIN64_SPILL_XMM %3
%endif
LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
%macro WIN64_PUSH_XMM 0
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
%if xmm_regs_used > 6
movaps [rstk + stack_offset + 8], xmm6
%endif
%if xmm_regs_used > 7
movaps [rstk + stack_offset + 24], xmm7
%endif
%if xmm_regs_used > 8
%assign %%i 8
%rep xmm_regs_used-8
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
%assign %%i %%i+1
%endrep
%endif
%endmacro
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 8
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
%assign %%pad (xmm_regs_used-8)*16 + 32
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
SUB rsp, stack_size_padded
%endif
WIN64_PUSH_XMM
%endmacro
%macro WIN64_RESTORE_XMM_INTERNAL 1
%assign %%pad_size 0
%if xmm_regs_used > 8
%assign %%i xmm_regs_used
%rep xmm_regs_used-8
%assign %%i %%i-1
movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
%endrep
%endif
%if stack_size_padded > 0
%if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm
%else
add %1, stack_size_padded
%assign %%pad_size stack_size_padded
%endif
%endif
%if xmm_regs_used > 7
movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
%endif
%if xmm_regs_used > 6
movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
%endif
%endmacro
%macro WIN64_RESTORE_XMM 1
WIN64_RESTORE_XMM_INTERNAL %1
%assign stack_offset (stack_offset-stack_size_padded)
%assign xmm_regs_used 0
%endmacro
%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
vzeroupper
%endif
AUTO_REP_RET
%endmacro
%elif ARCH_X86_64 ; *nix x64 ;=============================================
DECLARE_REG 0, rdi
DECLARE_REG 1, rsi
DECLARE_REG 2, rdx
DECLARE_REG 3, rcx
DECLARE_REG 4, R8
DECLARE_REG 5, R9
DECLARE_REG 6, rax, 8
DECLARE_REG 7, R10, 16
DECLARE_REG 8, R11, 24
DECLARE_REG 9, rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72
%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
PUSH_IF_USED 9, 10, 11, 12, 13, 14
ALLOC_STACK %4
LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
%macro RET 0
%if stack_size_padded > 0
%if required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm
%else
add rsp, stack_size_padded
%endif
%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
vzeroupper
%endif
AUTO_REP_RET
%endmacro
%else ; X86_32 ;==============================================================
DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp
%macro DECLARE_ARG 1-*
%rep %0
%define r%1m [rstk + stack_offset + 4*%1 + 4]
%define r%1mp dword r%1m
%rotate 1
%endrep
%endmacro
DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
ASSERT regs_used >= num_args
%if num_args > 7
%assign num_args 7
%endif
%if regs_used > 7
%assign regs_used 7
%endif
SETUP_STACK_POINTER %4
ASSERT regs_used <= 7
PUSH_IF_USED 3, 4, 5, 6
ALLOC_STACK %4
LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
%macro RET 0
%if stack_size_padded > 0
%if required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm
%else
add rsp, stack_size_padded
%endif
%endif
POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
vzeroupper
%endif
AUTO_REP_RET
%endmacro
%endif ;======================================================================
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
%endif
; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
%if has_epilogue
RET
%else
rep ret
%endif
%endmacro
%define last_branch_adr $$
%macro AUTO_REP_RET 0
%ifndef cpuflags
times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
%elif notcpuflag(ssse3)
times ((last_branch_adr-$)>>31)+1 rep
%endif
ret
%endmacro
%macro BRANCH_INSTR 0-*
%rep %0
%macro %1 1-2 %1
%2 %1
%%branch_instr:
%xdefine last_branch_adr %%branch_instr
%endmacro
%rotate 1
%endrep
%endmacro
BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
%macro TAIL_CALL 2 ; callee, is_nonadjacent
%if has_epilogue
call %1
RET
%elif %2
jmp %1
%endif
%endmacro
;=============================================================================
; arch-independent part
;=============================================================================
%assign function_align 16
; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
%if %1
%xdefine %%FUNCTION_PREFIX private_prefix
%xdefine %%VISIBILITY hidden
%else
%xdefine %%FUNCTION_PREFIX public_prefix
%xdefine %%VISIBILITY
%endif
%ifndef cglobaled_%2
%xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
%xdefine %2.skip_prologue %2 %+ .skip_prologue
CAT_XDEFINE cglobaled_, %2, 1
%endif
%xdefine current_function %2
%ifidn __OUTPUT_FORMAT__,elf
global %2:function %%VISIBILITY
%else
global %2
%endif
align function_align
%2:
RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
%xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
%assign stack_offset 0 ; stack pointer offset relative to the return address
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
%assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
%ifnidn %3, ""
PROLOGUE %3
%endif
%endmacro
%macro cextern 1
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
CAT_XDEFINE cglobaled_, %1, 1
extern %1
%endmacro
; like cextern, but without the prefix
%macro cextern_naked 1
%xdefine %1 mangle(%1)
CAT_XDEFINE cglobaled_, %1, 1
extern %1
%endmacro
%macro const 1-2+
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
%ifidn __OUTPUT_FORMAT__,elf
global %1:data hidden
%else
global %1
%endif
%1: %2
%endmacro
; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
; cpuflags
%assign cpuflags_mmx (1<<0)
%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
%assign cpuflags_avx (1<<11)| cpuflags_sse42
%assign cpuflags_xop (1<<12)| cpuflags_avx
%assign cpuflags_fma4 (1<<13)| cpuflags_avx
%assign cpuflags_fma3 (1<<14)| cpuflags_avx
%assign cpuflags_avx2 (1<<15)| cpuflags_fma3
%assign cpuflags_cache32 (1<<16)
%assign cpuflags_cache64 (1<<17)
%assign cpuflags_slowctz (1<<18)
%assign cpuflags_lzcnt (1<<19)