Xiph.Org / aom-rav1e / Commits

Commit ea2bac9f, authored Jun 19, 2014 by Johann; committed by Gerrit Code Review, Jun 19, 2014.

    Merge "sse4 regular quantize"

Parents: 02d557ea, 0d3ed089
Changes: 4 files
vp8/common/rtcd_defs.pl  View file @ ea2bac9f

...
@@ -463,9 +463,7 @@ $vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
 #
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2/;
-# TODO(johann) Update sse4 implementation and re-enable
-#$vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1/;

 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
 specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
...
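For context: in rtcd_defs.pl, add_proto declares a function signature and specialize lists the platform-specific variants the run-time CPU detection (RTCD) layer may dispatch to, so adding sse4_1 back lets capable CPUs pick the new intrinsics version. The following is a hedged, standalone sketch of the kind of dispatch the generated code performs; the stub names, flag arguments, and bodies are illustrative assumptions, not the generated vp8_rtcd.h.

/* Hedged sketch only: approximate shape of the run-time dispatch implied by
 * "specialize qw/vp8_regular_quantize_b sse2 sse4_1/". All names and flags
 * below are illustrative assumptions. */
#include <stdio.h>

struct block;
struct blockd;

static void quantize_c(struct block *b, struct blockd *d)      { (void)b; (void)d; puts("C"); }
static void quantize_sse2(struct block *b, struct blockd *d)   { (void)b; (void)d; puts("SSE2"); }
static void quantize_sse4_1(struct block *b, struct blockd *d) { (void)b; (void)d; puts("SSE4.1"); }

/* The encoder calls through this pointer; setup picks the best variant once. */
static void (*regular_quantize_b)(struct block *, struct blockd *) = quantize_c;

static void rtcd_setup_sketch(int has_sse2, int has_sse4_1) {
    if (has_sse2)   regular_quantize_b = quantize_sse2;
    if (has_sse4_1) regular_quantize_b = quantize_sse4_1;
}

int main(void) {
    rtcd_setup_sketch(1, 1);  /* pretend the CPU reports both features */
    regular_quantize_b(0, 0); /* prints "SSE4.1" */
    return 0;
}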
vp8/encoder/x86/quantize_sse4.asm  deleted 100644 → 0  View file @ 02d557ea

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"

; void vp8_regular_quantize_b_sse4 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1

global sym(vp8_regular_quantize_b_sse4) PRIVATE
sym(vp8_regular_quantize_b_sse4):

%if ABI_IS_32BIT
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rdi
    push        rsi

    ALIGN_STACK 16, rax

    %define qcoeff      0 ;  32
    %define stack_size 32
    sub         rsp, stack_size
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 8, u
    push        rdi
    push        rsi
  %endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov         rdi, rdi                   ; BLOCK *b
    ;mov         rsi, rsi                   ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_zbin]
    mov         rdx, [rdi + vp8_block_round]
    movd        xmm7, [rdi + vp8_block_zbin_extra]

    ; z
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax + 16]

    ; duplicate zbin_oq_value
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7

    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    ; sz
    psraw       xmm0, 15
    psraw       xmm1, 15

    ; (z ^ sz)
    pxor        xmm2, xmm0
    pxor        xmm3, xmm1

    ; x = abs(z)
    psubw       xmm2, xmm0
    psubw       xmm3, xmm1

    ; zbin
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7

    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm3

    ; x - (*zbin_ptr + zbin_oq_value)
    psubw       xmm6, xmm4
    psubw       xmm7, xmm5

    ; round
    movdqa      xmm4, [rdx]
    movdqa      xmm5, [rdx + 16]

    mov         rax, [rdi + vp8_block_quant_shift]
    mov         rcx, [rdi + vp8_block_quant]
    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]

    ; x + round
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; quant
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; y = x * quant_ptr >> 16
    pmulhw      xmm4, xmm2
    pmulhw      xmm5, xmm3

    ; y += x
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    pxor        xmm4, xmm4
%if ABI_IS_32BIT
    movdqa      [rsp + qcoeff], xmm4
    movdqa      [rsp + qcoeff + 16], xmm4
%else
    pxor        xmm8, xmm8
%endif

    ; quant_shift
    movdqa      xmm5, [rax]

    ; zrun_zbin_boost
    mov         rax, rdx

%macro ZIGZAG_LOOP 5
    ; x
    pextrw      ecx, %4, %2

    ; if (x >= zbin)
    sub         cx, WORD PTR[rdx]           ; x - zbin
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
    jl          .rq_zigzag_loop_%1          ; x < zbin

    pextrw      edi, %3, %2                 ; y

    ; downshift by quant_shift[rc]
    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
    sar         edi, cl                     ; also sets Z bit
    je          .rq_zigzag_loop_%1          ; !y
%if ABI_IS_32BIT
    mov         WORD PTR[rsp + qcoeff + %1 * 2], di
%else
    pinsrw      %5, edi, %2                 ; qcoeff[rc]
%endif
    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8

    mov         rcx, [rsi + vp8_blockd_dequant]
    mov         rdi, [rsi + vp8_blockd_dqcoeff]

%if ABI_IS_32BIT
    movdqa      xmm4, [rsp + qcoeff]
    movdqa      xmm5, [rsp + qcoeff + 16]
%else
    %define xmm5 xmm8
%endif

    ; y ^ sz
    pxor        xmm4, xmm0
    pxor        xmm5, xmm1
    ; x = (y ^ sz) - sz
    psubw       xmm4, xmm0
    psubw       xmm5, xmm1

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp8_blockd_qcoeff]

    pmullw      xmm0, xmm4
    pmullw      xmm1, xmm5

    ; store qcoeff
    movdqa      [rcx], xmm4
    movdqa      [rcx + 16], xmm5

    ; store dqcoeff
    movdqa      [rdi], xmm0
    movdqa      [rdi + 16], xmm1

    mov         rcx, [rsi + vp8_blockd_eob]

    ; select the last value (in zig_zag order) for EOB
    pxor        xmm6, xmm6
    pcmpeqw     xmm4, xmm6
    pcmpeqw     xmm5, xmm6
    packsswb    xmm4, xmm5
    pshufb      xmm4, [GLOBAL(zig_zag1d)]
    pmovmskb    edx, xmm4
    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax
    bsr         eax, edx
    sub         edi, edx
    sar         edi, 31
    add         eax, 1
    and         eax, edi

    mov         BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    add         rsp, stack_size
    pop         rsp

    pop         rsi
    pop         rdi
    RESTORE_GOT
    pop         rbp
%else
  %undef xmm5
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
    RESTORE_XMM
  %endif
%endif

    ret

SECTION_RODATA
align 16
; vp8/common/entropy.c: vp8_default_zig_zag1d
zig_zag1d:
    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
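The deleted assembly ends by deriving the EOB with a compare/pack/shuffle/bit-scan sequence (pcmpeqw, packsswb, pshufb by zig_zag1d, pmovmskb, bsr). Below is a hedged scalar sketch, not taken from the source, of what that tail computes, assuming qcoeff holds the 16 quantized coefficients in raster order and the table is vp8_default_zig_zag1d.

/* Hedged scalar sketch of the EOB computation: 1 + the zig-zag position of
 * the last nonzero quantized coefficient, or 0 when the block is all zero. */
#include <stdio.h>

/* vp8_default_zig_zag1d, as in the zig_zag1d table above */
static const unsigned char zz[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };

static char eob_scalar_sketch(const short qcoeff[16]) {
    char eob = 0;
    int i;
    for (i = 0; i < 16; ++i) {
        if (qcoeff[zz[i]] != 0)  /* visit coefficients in zig-zag order */
            eob = (char)(i + 1); /* remember 1 + last nonzero position */
    }
    return eob;
}

int main(void) {
    short q[16] = { 3, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    /* raster index 4 is zig-zag position 2, so eob is 3 */
    printf("%d\n", eob_scalar_sketch(q));
    return 0;
}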
vp8/encoder/x86/quantize_sse4.c  0 → 100644  View file @ ea2bac9f

/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <smmintrin.h> /* SSE4.1 */

#include "./vp8_rtcd.h"
#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

#define SELECT_EOB(i, z, x, y, q) \
    do { \
        __label__ select_eob_end; \
        short boost = *zbin_boost_ptr; \
        short x_z = _mm_extract_epi16(x, z); \
        short y_z = _mm_extract_epi16(y, z); \
        int cmp = (x_z < boost) | (y_z == 0); \
        zbin_boost_ptr++; \
        if (cmp) \
            goto select_eob_end; \
        q = _mm_insert_epi16(q, y_z, z); \
        eob = i; \
        zbin_boost_ptr = b->zrun_zbin_boost; \
        select_eob_end:; \
    } while (0)

void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
    char eob = 0;
    short *zbin_boost_ptr = b->zrun_zbin_boost;

    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1,
            dqcoeff0, dqcoeff1;
    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
    __m128i qcoeff0 = _mm_setzero_si128();
    __m128i qcoeff1 = _mm_setzero_si128();

    /* Duplicate to all lanes. */
    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

    /* Sign of z: z >> 15 */
    sz0 = _mm_srai_epi16(z0, 15);
    sz1 = _mm_srai_epi16(z1, 15);

    /* x = abs(z): (z ^ sz) - sz */
    x0 = _mm_xor_si128(z0, sz0);
    x1 = _mm_xor_si128(z1, sz1);
    x0 = _mm_sub_epi16(x0, sz0);
    x1 = _mm_sub_epi16(x1, sz1);

    /* zbin[] + zbin_extra */
    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
    zbin1 = _mm_add_epi16(zbin1, zbin_extra);

    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
     * the equation because boost is the only value which can change:
     * x - (zbin[] + extra) >= boost */
    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);

    /* All the remaining calculations are valid whether they are done now with
     * simd or later inside the loop one at a time. */
    x0 = _mm_add_epi16(x0, round0);
    x1 = _mm_add_epi16(x1, round1);

    y0 = _mm_mulhi_epi16(x0, quant0);
    y1 = _mm_mulhi_epi16(x1, quant1);

    y0 = _mm_add_epi16(y0, x0);
    y1 = _mm_add_epi16(y1, x1);

    /* Instead of shifting each value independently we convert the scaling
     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
    y0 = _mm_mulhi_epi16(y0, quant_shift0);
    y1 = _mm_mulhi_epi16(y1, quant_shift1);

    /* Return the sign: (y ^ sz) - sz */
    y0 = _mm_xor_si128(y0, sz0);
    y1 = _mm_xor_si128(y1, sz1);
    y0 = _mm_sub_epi16(y0, sz0);
    y1 = _mm_sub_epi16(y1, sz1);

    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
    SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);

    _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
    _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);

    dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0);
    dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1);

    _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
    _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);

    *d->eob = eob;
}
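One comment in the new file is worth unpacking: SSE has no per-lane variable shift for 16-bit values, so the shift amount is pre-converted to a factor of 1 << (16 - shift) and _mm_mulhi_epi16 (multiply, keep the high 16 bits) performs the shift. The following is a hedged, standalone sketch of that equivalence for a non-negative value and a shift small enough that the factor fits a signed 16-bit lane; the pre-scaling itself is assumed to happen elsewhere in the encoder.

/* Hedged sketch: (v >> shift) equals the high 16 bits of
 * v * (1 << (16 - shift)), mirroring the quant_shift trick noted above. */
#include <assert.h>
#include <stdint.h>

static int16_t shift_via_mulhi(int16_t v, int shift) {
    int32_t factor = 1 << (16 - shift);            /* pre-scaled "quant_shift" */
    return (int16_t)(((int32_t)v * factor) >> 16); /* the high half a pmulhw-style multiply keeps */
}

int main(void) {
    assert(shift_via_mulhi(1234, 3) == (1234 >> 3));
    assert(shift_via_mulhi(89, 5) == (89 >> 5));
    assert(shift_via_mulhi(40, 2) == (40 >> 2));
    return 0;
}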
vp8/vp8cx.mk  View file @ ea2bac9f

...
@@ -89,6 +89,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
...
@@ -97,7 +98,6 @@ endif
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
 VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt_x86_64.asm
...