Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Guillaume Martres
aom-rav1e
Commits
beaafefc
Commit
beaafefc
authored
Mar 24, 2011
by
Johann
Committed by
Code Review
Mar 24, 2011
Browse files
Merge "use asm_offsets with vp8_regular_quantize_b_sse2"
parents
4cde2ab7
8edaf6e2
Changes
6
Hide whitespace changes
Inline
Side-by-side
build/make/Makefile
View file @
beaafefc
...
...
@@ -331,11 +331,8 @@ ifneq ($(call enabled,DIST-SRCS),)
DIST-SRCS-$(CONFIG_MSVS)
+=
build/make/gen_msvs_sln.sh
DIST-SRCS-$(CONFIG_MSVS)
+=
build/x86-msvs/yasm.rules
DIST-SRCS-$(CONFIG_RVCT)
+=
build/make/armlink_adapter.sh
#
# This isn't really ARCH_ARM dependent, it's dependent on whether we're
# using assembly code or not (CONFIG_OPTIMIZATIONS maybe). Just use
# this for now.
DIST-SRCS-$(ARCH_ARM)
+=
build/make/obj_int_extract.c
# Include obj_int_extract if we use offsets from asm_*_offsets
DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64)
+=
build/make/obj_int_extract.c
DIST-SRCS-$(ARCH_ARM)
+=
build/make/ads2gas.pl
DIST-SRCS-yes
+=
$
(
target:-
$(TOOLCHAIN)
=)
.mk
endif
...
...
libs.mk
View file @
beaafefc
...
...
@@ -245,7 +245,9 @@ ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat
OBJS-yes
+=
$(VP8_PREFIX)
common/asm_com_offsets.c.o
CLEAN-OBJS
+=
asm_com_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes))
:
$(BUILD_PFX)asm_com_offsets.asm
endif
ifeq
($(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64), yes)
ifeq
($(CONFIG_VP8_ENCODER), yes)
asm_enc_offsets.asm
:
obj_int_extract
asm_enc_offsets.asm
:
$(VP8_PREFIX)encoder/asm_enc_offsets.c.o
...
...
@@ -254,7 +256,9 @@ ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat
CLEAN-OBJS
+=
asm_enc_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes))
:
$(BUILD_PFX)asm_enc_offsets.asm
endif
endif
ifeq
($(ARCH_ARM), yes)
ifeq
($(CONFIG_VP8_DECODER), yes)
asm_dec_offsets.asm
:
obj_int_extract
asm_dec_offsets.asm
:
$(VP8_PREFIX)decoder/asm_dec_offsets.c.o
...
...
vp8/encoder/asm_enc_offsets.c
View file @
beaafefc
...
...
@@ -12,9 +12,11 @@
#include "vpx_ports/config.h"
#include <stddef.h>
#include "block.h"
#include "vp8/common/blockd.h"
#include "onyx_int.h"
#include "treewriter.h"
#include "tokenize.h"
#include "onyx_int.h"
#define ct_assert(name,cond) \
static void assert_##name(void) UNUSED;\
...
...
@@ -31,6 +33,21 @@
* {
*/
//regular quantize
DEFINE
(
vp8_block_coeff
,
offsetof
(
BLOCK
,
coeff
));
DEFINE
(
vp8_block_zbin
,
offsetof
(
BLOCK
,
zbin
));
DEFINE
(
vp8_block_round
,
offsetof
(
BLOCK
,
round
));
DEFINE
(
vp8_block_quant
,
offsetof
(
BLOCK
,
quant
));
DEFINE
(
vp8_block_quant_fast
,
offsetof
(
BLOCK
,
quant_fast
));
DEFINE
(
vp8_block_zbin_extra
,
offsetof
(
BLOCK
,
zbin_extra
));
DEFINE
(
vp8_block_zrun_zbin_boost
,
offsetof
(
BLOCK
,
zrun_zbin_boost
));
DEFINE
(
vp8_block_quant_shift
,
offsetof
(
BLOCK
,
quant_shift
));
DEFINE
(
vp8_blockd_qcoeff
,
offsetof
(
BLOCKD
,
qcoeff
));
DEFINE
(
vp8_blockd_dequant
,
offsetof
(
BLOCKD
,
dequant
));
DEFINE
(
vp8_blockd_dqcoeff
,
offsetof
(
BLOCKD
,
dqcoeff
));
DEFINE
(
vp8_blockd_eob
,
offsetof
(
BLOCKD
,
eob
));
//pack tokens
DEFINE
(
vp8_writer_lowvalue
,
offsetof
(
vp8_writer
,
lowvalue
));
DEFINE
(
vp8_writer_range
,
offsetof
(
vp8_writer
,
range
));
...
...
@@ -65,17 +82,6 @@ DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST));
DEFINE
(
vp8_common_mb_rows
,
offsetof
(
VP8_COMMON
,
mb_rows
));
// offsets from BLOCK structure
DEFINE
(
vp8_block_coeff
,
offsetof
(
BLOCK
,
coeff
));
DEFINE
(
vp8_block_quant_fast
,
offsetof
(
BLOCK
,
quant_fast
));
DEFINE
(
vp8_block_round
,
offsetof
(
BLOCK
,
round
));
// offsets from BLOCKD structure
DEFINE
(
vp8_blockd_qcoeff
,
offsetof
(
BLOCKD
,
qcoeff
));
DEFINE
(
vp8_blockd_dqcoeff
,
offsetof
(
BLOCKD
,
dqcoeff
));
DEFINE
(
vp8_blockd_dequant
,
offsetof
(
BLOCKD
,
dequant
));
DEFINE
(
vp8_blockd_eob
,
offsetof
(
BLOCKD
,
eob
));
// These two sizes are used in vp8cx_pack_tokens. They are hard coded
// so if the size changes this will have to be adjusted.
#if HAVE_ARMV5TE
...
...
vp8/encoder/x86/quantize_sse2.asm
View file @
beaafefc
...
...
@@ -9,48 +9,59 @@
%include "vpx_ports/x86_abi_support.asm"
%include "asm_enc_offsets.asm"
;int vp8_regular_quantize_b_impl_sse2(
; short *coeff_ptr,
; short *zbin_ptr,
; short *qcoeff_ptr,
; short *dequant_ptr,
; const int *default_zig_zag,
; short *round_ptr,
; short *quant_ptr,
; short *dqcoeff_ptr,
; unsigned short zbin_oq_value,
; short *zbin_boost_ptr,
; short *quant_shift);
;
global
sym
(
vp8_regular_quantize_b_impl_sse2
)
sym
(
vp8_regular_quantize_b_impl_sse2
):
; void vp8_regular_quantize_b_sse2 | arg
; (BLOCK *b, | 0
; BLOCKD *d) | 1
global
sym
(
vp8_regular_quantize_b_sse2
)
sym
(
vp8_regular_quantize_b_sse2
):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
11
SAVE_XMM
GET_GOT
rbx
push
rsi
%if ABI_IS_32BIT
push
rdi
%else
%ifidn __OUTPUT_FORMAT__,x64
push
rdi
push
rbx
%endif
%endif
ALIGN
_STACK
16
,
rax
%define abs_minus_zbin 0
%define temp_qcoeff 32
%define qcoeff 64
%define eob_tmp 96
%define BLOCKD_d 0
; 8
%define zrun_zbin_boost 8
; 8
%define abs_minus_zbin 16
; 32
%define temp_qcoeff 48
; 32
%define qcoeff 80
; 32
%define stack_size 112
sub
rsp
,
stack_size
; end prolog
mov
rdx
,
arg
(
0
)
; coeff_ptr
mov
rcx
,
arg
(
1
)
; zbin_ptr
movd
xmm7
,
arg
(
8
)
; zbin_oq_value
mov
rdi
,
arg
(
5
)
; round_ptr
mov
rsi
,
arg
(
6
)
; quant_ptr
%if ABI_IS_32BIT
mov
rdi
,
arg
(
0
)
%else
%ifidn __OUTPUT_FORMAT__,x64
mov
rdi
,
rcx
; BLOCK *b
mov
[
rsp
+
BL
OCKD_d
],
rdx
%else
;mov rdi, rdi ; BLOCK *b
mov
[
rsp
+
BL
OCKD_d
],
rsi
%endif
%endif
mov
rdx
,
[
rdi
+
vp8_block_coeff
]
; coeff_ptr
mov
rcx
,
[
rdi
+
vp8_block_zbin
]
; zbin_ptr
movd
xmm7
,
[
rdi
+
vp8_block_zbin_extra
]
; zbin_oq_value
; z
movdqa
xmm0
,
OWORD
PTR
[
rdx
]
movdqa
xmm4
,
OWORD
PTR
[
rdx
+
16
]
movdqa
xmm0
,
[
rdx
]
movdqa
xmm4
,
[
rdx
+
16
]
mov
rdx
,
[
rdi
+
vp8_block_round
]
; round_ptr
pshuflw
xmm7
,
xmm7
,
0
punpcklwd
xmm7
,
xmm7
; duplicated zbin_oq_value
...
...
@@ -70,8 +81,9 @@ sym(vp8_regular_quantize_b_impl_sse2):
psubw
xmm1
,
xmm0
psubw
xmm5
,
xmm4
movdqa
xmm2
,
OWORD
PTR
[
rcx
]
movdqa
xmm3
,
OWORD
PTR
[
rcx
+
16
]
movdqa
xmm2
,
[
rcx
]
movdqa
xmm3
,
[
rcx
+
16
]
mov
rcx
,
[
rdi
+
vp8_block_quant
]
; quant_ptr
; *zbin_ptr + zbin_oq_value
paddw
xmm2
,
xmm7
...
...
@@ -80,18 +92,18 @@ sym(vp8_regular_quantize_b_impl_sse2):
; x - (*zbin_ptr + zbin_oq_value)
psubw
xmm1
,
xmm2
psubw
xmm5
,
xmm3
movdqa
OWORD
PTR
[
rsp
+
abs_minus_zbin
],
xmm1
movdqa
OWORD
PTR
[
rsp
+
abs_minus_zbin
+
16
],
xmm5
movdqa
[
rsp
+
abs_minus_zbin
],
xmm1
movdqa
[
rsp
+
abs_minus_zbin
+
16
],
xmm5
; add (zbin_ptr + zbin_oq_value) back
paddw
xmm1
,
xmm2
paddw
xmm5
,
xmm3
movdqa
xmm2
,
OWORD
PTR
[
rd
i
]
movdqa
xmm6
,
OWORD
PTR
[
rd
i
+
16
]
movdqa
xmm2
,
[
rd
x
]
movdqa
xmm6
,
[
rd
x
+
16
]
movdqa
xmm3
,
OWORD
PTR
[
rsi
]
movdqa
xmm7
,
OWORD
PTR
[
rsi
+
16
]
movdqa
xmm3
,
[
rcx
]
movdqa
xmm7
,
[
rcx
+
16
]
; x + round
paddw
xmm1
,
xmm2
...
...
@@ -105,68 +117,67 @@ sym(vp8_regular_quantize_b_impl_sse2):
paddw
xmm1
,
xmm3
paddw
xmm5
,
xmm7
movdqa
OWORD
PTR
[
rsp
+
temp_qcoeff
],
xmm1
movdqa
OWORD
PTR
[
rsp
+
temp_qcoeff
+
16
],
xmm5
movdqa
[
rsp
+
temp_qcoeff
],
xmm1
movdqa
[
rsp
+
temp_qcoeff
+
16
],
xmm5
pxor
xmm6
,
xmm6
; zero qcoeff
movdqa
OWORD
PTR
[
rsp
+
qcoeff
],
xmm6
movdqa
OWORD
PTR
[
rsp
+
qcoeff
+
16
],
xmm6
movdqa
[
rsp
+
qcoeff
],
xmm6
movdqa
[
rsp
+
qcoeff
+
16
],
xmm6
mov
[
rsp
+
eob_tmp
],
DWORD
-
1
; eob
mov
rsi
,
arg
(
9
)
; zbin_boost_ptr
mov
rdi
,
arg
(
4
)
; default_zig_zag
mov
rax
,
arg
(
10
)
; quant_shift_ptr
mov
rsi
,
[
rdi
+
vp8_block_zrun_zbin_boost
]
; zbin_boost_ptr
mov
rax
,
[
rdi
+
vp8_block_quant_shift
]
; quant_shift_ptr
mov
[
rsp
+
zrun_zbin_boost
],
rsi
%macro ZIGZAG_LOOP 2
rq_zigzag_loop_
%
1
:
movsxd
rdx
,
DWORD
PTR
[
rdi
+
(
%
1
*
4
)]
; rc
movsx
ebx
,
WORD
PTR
[
rsi
]
; *zbin_boost_ptr
lea
rsi
,
[
rsi
+
2
]
; zbin_boost_ptr++
%macro ZIGZAG_LOOP 1
movsx
edx
,
WORD
PTR
[
GLOBAL
(
zig_zag
)
+
(
%
1
*
2
)]
; rc
; x
movsx
ecx
,
WORD
PTR
[
rsp
+
abs_minus_zbin
+
rdx
*
2
]
; if (x >= zbin)
sub
ecx
,
ebx
; x - zbin
jl
rq_zigzag_loop_
%
2
; x < zbin
sub
cx
,
WORD
PTR
[
rsi
]
; x - zbin
lea
rsi
,
[
rsi
+
2
]
; zbin_boost_ptr++
jl
rq_zigzag_loop_
%
1
; x < zbin
movsx
e
bx
,
WORD
PTR
[
rsp
+
temp_qcoeff
+
rdx
*
2
]
movsx
e
di
,
WORD
PTR
[
rsp
+
temp_qcoeff
+
rdx
*
2
]
; downshift by quant_shift[rdx]
movsx
ecx
,
WORD
PTR
[
rax
+
rdx
*
2
]
; quant_shift_ptr[rc]
sar
ebx
,
cl
; also sets Z bit
je
rq_zigzag_loop_
%
2
; !y
mov
WORD
PTR
[
rsp
+
qcoeff
+
rdx
*
2
],
bx
;qcoeff_ptr[rc] = temp_qcoeff[rc]
mov
rsi
,
arg
(
9
)
; reset to b->zrun_zbin_boost
mov
[
rsp
+
eob_tmp
],
DWORD
%
1
; eob = i
sar
edi
,
cl
; also sets Z bit
je
rq_zigzag_loop_
%
1
; !y
mov
WORD
PTR
[
rsp
+
qcoeff
+
rdx
*
2
],
di
;qcoeff_ptr[rc] = temp_qcoeff[rc]
mov
rsi
,
[
rsp
+
zrun_zbin_boost
]
; reset to b->zrun_zbin_boost
rq_zigzag_loop_
%
1
:
%endmacro
ZIGZAG_LOOP
0
,
1
ZIGZAG_LOOP
1
,
2
ZIGZAG_LOOP
2
,
3
ZIGZAG_LOOP
3
,
4
ZIGZAG_LOOP
4
,
5
ZIGZAG_LOOP
5
,
6
ZIGZAG_LOOP
6
,
7
ZIGZAG_LOOP
7
,
8
ZIGZAG_LOOP
8
,
9
ZIGZAG_LOOP
9
,
10
ZIGZAG_LOOP
10
,
11
ZIGZAG_LOOP
11
,
12
ZIGZAG_LOOP
12
,
13
ZIGZAG_LOOP
13
,
14
ZIGZAG_LOOP
14
,
15
ZIGZAG_LOOP
15
,
end
rq_zigzag_loop_end:
mov
rbx
,
arg
(
2
)
; qcoeff_ptr
mov
rcx
,
arg
(
3
)
; dequant_ptr
mov
rsi
,
arg
(
7
)
; dqcoeff_ptr
mov
rax
,
[
rsp
+
eob_tmp
]
; eob
movdqa
xmm2
,
OWORD
PTR
[
rsp
+
qcoeff
]
movdqa
xmm3
,
OWORD
PTR
[
rsp
+
qcoeff
+
16
]
ZIGZAG_LOOP
0
ZIGZAG_LOOP
1
ZIGZAG_LOOP
2
ZIGZAG_LOOP
3
ZIGZAG_LOOP
4
ZIGZAG_LOOP
5
ZIGZAG_LOOP
6
ZIGZAG_LOOP
7
ZIGZAG_LOOP
8
ZIGZAG_LOOP
9
ZIGZAG_LOOP
10
ZIGZAG_LOOP
11
ZIGZAG_LOOP
12
ZIGZAG_LOOP
13
ZIGZAG_LOOP
14
ZIGZAG_LOOP
15
movdqa
xmm2
,
[
rsp
+
qcoeff
]
movdqa
xmm3
,
[
rsp
+
qcoeff
+
16
]
%if ABI_IS_32BIT
mov
rdi
,
arg
(
1
)
%else
mov
rdi
,
[
rsp
+
BL
OCKD_d
]
%endif
mov
rcx
,
[
rdi
+
vp8_blockd_dequant
]
; dequant_ptr
mov
rsi
,
[
rdi
+
vp8_blockd_dqcoeff
]
; dqcoeff_ptr
; y ^ sz
pxor
xmm2
,
xmm0
...
...
@@ -175,34 +186,67 @@ rq_zigzag_loop_end:
psubw
xmm2
,
xmm0
psubw
xmm3
,
xmm4
movdqa
xmm0
,
OWORD
PTR
[
rcx
]
movdqa
xmm1
,
OWORD
PTR
[
rcx
+
16
]
; dequant
movdqa
xmm0
,
[
rcx
]
movdqa
xmm1
,
[
rcx
+
16
]
mov
rcx
,
[
rdi
+
vp8_blockd_qcoeff
]
; qcoeff_ptr
pmullw
xmm0
,
xmm2
pmullw
xmm1
,
xmm3
movdqa
OWORD
PTR
[
rbx
],
xmm2
movdqa
OWORD
PTR
[
rbx
+
16
],
xmm3
movdqa
OWORD
PTR
[
rsi
],
xmm0
; store dqcoeff
movdqa
OWORD
PTR
[
rsi
+
16
],
xmm1
; store dqcoeff
add
rax
,
1
movdqa
[
rcx
],
xmm2
; store qcoeff
movdqa
[
rcx
+
16
],
xmm3
movdqa
[
rsi
],
xmm0
; store dqcoeff
movdqa
[
rsi
+
16
],
xmm1
; select the last value (in zig_zag order) for EOB
pcmpeqw
xmm2
,
xmm6
pcmpeqw
xmm3
,
xmm6
; !
pcmpeqw
xmm6
,
xmm6
pxor
xmm2
,
xmm6
pxor
xmm3
,
xmm6
; mask inv_zig_zag
pand
xmm2
,
[
GLOBAL
(
inv_zig_zag
)]
pand
xmm3
,
[
GLOBAL
(
inv_zig_zag
)
+
16
]
; select the max value
pmaxsw
xmm2
,
xmm3
pshufd
xmm3
,
xmm2
,
00001110b
pmaxsw
xmm2
,
xmm3
pshuflw
xmm3
,
xmm2
,
00001110b
pmaxsw
xmm2
,
xmm3
pshuflw
xmm3
,
xmm2
,
00000001b
pmaxsw
xmm2
,
xmm3
movd
eax
,
xmm2
and
eax
,
0xff
mov
[
rdi
+
vp8_blockd_eob
],
eax
; begin epilog
add
rsp
,
stack_size
pop
rsp
pop
rbx
%if ABI_IS_32BIT
pop
rdi
%else
%ifidn __OUTPUT_FORMAT__,x64
pop
rdi
%endif
%endif
pop
rsi
REST
ORE_GOT
REST
ORE_XMM
UNSHADOW_ARGS
pop
rbp
ret
;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
; short *qcoeff_ptr,short *dequant_ptr,
; short *inv_scan_order, short *round_ptr,
; short *quant_ptr, short *dqcoeff_ptr);
; int vp8_fast_quantize_b_impl_sse2 | arg
; (short *coeff_ptr, | 0
; short *qcoeff_ptr, | 1
; short *dequant_ptr, | 2
; short *inv_scan_order, | 3
; short *round_ptr, | 4
; short *quant_ptr, | 5
; short *dqcoeff_ptr) | 6
global
sym
(
vp8_fast_quantize_b_impl_sse2
)
sym
(
vp8_fast_quantize_b_impl_sse2
):
push
rbp
...
...
@@ -300,3 +344,16 @@ sym(vp8_fast_quantize_b_impl_sse2):
UNSHADOW_ARGS
pop
rbp
ret
SECTION
_RODATA
align
16
zig_zag:
dw
0x0000
,
0x0001
,
0x0004
,
0x0008
dw
0x0005
,
0x0002
,
0x0003
,
0x0006
dw
0x0009
,
0x000c
,
0x000d
,
0x000a
dw
0x0007
,
0x000b
,
0x000e
,
0x000f
inv_zig_zag:
dw
0x0001
,
0x0002
,
0x0006
,
0x0007
dw
0x0003
,
0x0005
,
0x0008
,
0x000d
dw
0x0004
,
0x0009
,
0x000c
,
0x000e
dw
0x000a
,
0x000b
,
0x000f
,
0x0010
vp8/encoder/x86/quantize_x86.h
View file @
beaafefc
...
...
@@ -27,11 +27,8 @@ extern prototype_quantize_block(vp8_regular_quantize_b_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
// Currently, this function realizes a gain on x86 and a loss on x86_64
#if ARCH_X86
#undef vp8_quantize_quantb
#define vp8_quantize_quantb vp8_regular_quantize_b_sse2
#endif
#endif
...
...
vp8/encoder/x86/x86_csystemdependent.c
View file @
beaafefc
...
...
@@ -106,30 +106,6 @@ static void fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
);
}
int
vp8_regular_quantize_b_impl_sse2
(
short
*
coeff_ptr
,
short
*
zbin_ptr
,
short
*
qcoeff_ptr
,
short
*
dequant_ptr
,
const
int
*
default_zig_zag
,
short
*
round_ptr
,
short
*
quant_ptr
,
short
*
dqcoeff_ptr
,
unsigned
short
zbin_oq_value
,
short
*
zbin_boost_ptr
,
short
*
quant_shift_ptr
);
static
void
regular_quantize_b_sse2
(
BLOCK
*
b
,
BLOCKD
*
d
)
{
d
->
eob
=
vp8_regular_quantize_b_impl_sse2
(
b
->
coeff
,
b
->
zbin
,
d
->
qcoeff
,
d
->
dequant
,
vp8_default_zig_zag1d
,
b
->
round
,
b
->
quant
,
d
->
dqcoeff
,
b
->
zbin_extra
,
b
->
zrun_zbin_boost
,
b
->
quant_shift
);
}
int
vp8_mbblock_error_xmm_impl
(
short
*
coeff_ptr
,
short
*
dcoef_ptr
,
int
dc
);
static
int
mbblock_error_xmm
(
MACROBLOCK
*
mb
,
int
dc
)
{
...
...
@@ -317,9 +293,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi
->
rtcd
.
encodemb
.
submby
=
vp8_subtract_mby_sse2
;
cpi
->
rtcd
.
encodemb
.
submbuv
=
vp8_subtract_mbuv_sse2
;
#if ARCH_X86
cpi
->
rtcd
.
quantize
.
quantb
=
regular_quantize_b_sse2
;
#endif
cpi
->
rtcd
.
quantize
.
quantb
=
vp8_regular_quantize_b_sse2
;
cpi
->
rtcd
.
quantize
.
fastquantb
=
fast_quantize_b_sse2
;
#if !(CONFIG_REALTIME_ONLY)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment