Xiph.Org / aom-rav1e

Commit ea2bac9f, authored Jun 19, 2014 by Johann, committed by Gerrit Code Review on Jun 19, 2014

    Merge "sse4 regular quantize"

Parents: 02d557ea 0d3ed089
Showing 4 changed files, with 132 additions and 260 deletions:

    vp8/common/rtcd_defs.pl             +1    -3
    vp8/encoder/x86/quantize_sse4.asm   +0  -256
    vp8/encoder/x86/quantize_sse4.c   +130    -0
    vp8/vp8cx.mk                        +1    -1
vp8/common/rtcd_defs.pl:

@@ -463,9 +463,7 @@ $vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2/;
-# TODO(johann) Update sse4 implementation and re-enable
-#$vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1/;

 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
 specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
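The specialize entry above feeds libvpx's run-time CPU detection (RTCD): the build generates a vp8_rtcd.h that routes calls through a function pointer chosen from CPU flags. As a hedged sketch only (the real header is emitted from rtcd_defs.pl at build time; the setup function shown here is illustrative, not part of this commit):

    /* Hedged sketch of RTCD dispatch for vp8_regular_quantize_b; the real
     * vp8_rtcd.h is generated from rtcd_defs.pl at build time. */
    #include "vpx_ports/x86.h" /* x86_simd_caps(), HAS_SSE2, HAS_SSE4_1 */

    struct block;
    struct blockd;

    void vp8_regular_quantize_b_c(struct block *b, struct blockd *d);
    void vp8_regular_quantize_b_sse2(struct block *b, struct blockd *d);
    void vp8_regular_quantize_b_sse4_1(struct block *b, struct blockd *d);

    /* Function pointer the encoder calls through. */
    void (*vp8_regular_quantize_b)(struct block *, struct blockd *);

    static void setup_rtcd_sketch(void) {
        int flags = x86_simd_caps();
        vp8_regular_quantize_b = vp8_regular_quantize_b_c; /* portable fallback */
        if (flags & HAS_SSE2)
            vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
        if (flags & HAS_SSE4_1) /* re-enabled by this commit */
            vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
    }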
vp8/encoder/x86/quantize_sse4.asm (deleted, 100644 → 0):

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"


; void vp8_regular_quantize_b_sse4 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1

global sym(vp8_regular_quantize_b_sse4) PRIVATE
sym(vp8_regular_quantize_b_sse4):

%if ABI_IS_32BIT
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rdi
    push        rsi

    ALIGN_STACK 16, rax

    %define qcoeff      0 ; 32
    %define stack_size 32
    sub         rsp, stack_size
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 8, u
    push        rdi
    push        rsi
  %endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov         rdi, rdi                   ; BLOCK *b
    ;mov         rsi, rsi                   ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_zbin]
    mov         rdx, [rdi + vp8_block_round]
    movd        xmm7, [rdi + vp8_block_zbin_extra]

    ; z
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax + 16]

    ; duplicate zbin_oq_value
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7

    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    ; sz
    psraw       xmm0, 15
    psraw       xmm1, 15

    ; (z ^ sz)
    pxor        xmm2, xmm0
    pxor        xmm3, xmm1

    ; x = abs(z)
    psubw       xmm2, xmm0
    psubw       xmm3, xmm1

    ; zbin
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7

    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm3

    ; x - (*zbin_ptr + zbin_oq_value)
    psubw       xmm6, xmm4
    psubw       xmm7, xmm5

    ; round
    movdqa      xmm4, [rdx]
    movdqa      xmm5, [rdx + 16]

    mov         rax, [rdi + vp8_block_quant_shift]
    mov         rcx, [rdi + vp8_block_quant]
    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]

    ; x + round
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; quant
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; y = x * quant_ptr >> 16
    pmulhw      xmm4, xmm2
    pmulhw      xmm5, xmm3

    ; y += x
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    pxor        xmm4, xmm4
%if ABI_IS_32BIT
    movdqa      [rsp + qcoeff], xmm4
    movdqa      [rsp + qcoeff + 16], xmm4
%else
    pxor        xmm8, xmm8
%endif

    ; quant_shift
    movdqa      xmm5, [rax]

    ; zrun_zbin_boost
    mov         rax, rdx

%macro ZIGZAG_LOOP 5
    ; x
    pextrw      ecx, %4, %2

    ; if (x >= zbin)
    sub         cx, WORD PTR[rdx]           ; x - zbin
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
    jl          .rq_zigzag_loop_%1          ; x < zbin

    pextrw      edi, %3, %2                 ; y

    ; downshift by quant_shift[rc]
    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
    sar         edi, cl                     ; also sets Z bit
    je          .rq_zigzag_loop_%1          ; !y
%if ABI_IS_32BIT
    mov         WORD PTR[rsp + qcoeff + %1 *2], di
%else
    pinsrw      %5, edi, %2                 ; qcoeff[rc]
%endif
    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP  0,  0, xmm2, xmm6, xmm4
ZIGZAG_LOOP  1,  1, xmm2, xmm6, xmm4
ZIGZAG_LOOP  4,  4, xmm2, xmm6, xmm4
ZIGZAG_LOOP  8,  0, xmm3, xmm7, xmm8
ZIGZAG_LOOP  5,  5, xmm2, xmm6, xmm4
ZIGZAG_LOOP  2,  2, xmm2, xmm6, xmm4
ZIGZAG_LOOP  3,  3, xmm2, xmm6, xmm4
ZIGZAG_LOOP  6,  6, xmm2, xmm6, xmm4
ZIGZAG_LOOP  9,  1, xmm3, xmm7, xmm8
ZIGZAG_LOOP 12,  4, xmm3, xmm7, xmm8
ZIGZAG_LOOP 13,  5, xmm3, xmm7, xmm8
ZIGZAG_LOOP 10,  2, xmm3, xmm7, xmm8
ZIGZAG_LOOP  7,  7, xmm2, xmm6, xmm4
ZIGZAG_LOOP 11,  3, xmm3, xmm7, xmm8
ZIGZAG_LOOP 14,  6, xmm3, xmm7, xmm8
ZIGZAG_LOOP 15,  7, xmm3, xmm7, xmm8

    mov         rcx, [rsi + vp8_blockd_dequant]
    mov         rdi, [rsi + vp8_blockd_dqcoeff]

%if ABI_IS_32BIT
    movdqa      xmm4, [rsp + qcoeff]
    movdqa      xmm5, [rsp + qcoeff + 16]
%else
    %define     xmm5 xmm8
%endif

    ; y ^ sz
    pxor        xmm4, xmm0
    pxor        xmm5, xmm1
    ; x = (y ^ sz) - sz
    psubw       xmm4, xmm0
    psubw       xmm5, xmm1

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp8_blockd_qcoeff]

    pmullw      xmm0, xmm4
    pmullw      xmm1, xmm5

    ; store qcoeff
    movdqa      [rcx], xmm4
    movdqa      [rcx + 16], xmm5

    ; store dqcoeff
    movdqa      [rdi], xmm0
    movdqa      [rdi + 16], xmm1

    mov         rcx, [rsi + vp8_blockd_eob]

    ; select the last value (in zig_zag order) for EOB
    pxor        xmm6, xmm6
    pcmpeqw     xmm4, xmm6
    pcmpeqw     xmm5, xmm6
    packsswb    xmm4, xmm5
    pshufb      xmm4, [GLOBAL(zig_zag1d)]
    pmovmskb    edx, xmm4
    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax
    bsr         eax, edx
    sub         edi, edx
    sar         edi, 31
    add         eax, 1
    and         eax, edi

    mov         BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    add         rsp, stack_size
    pop         rsp

    pop         rsi
    pop         rdi
    RESTORE_GOT
    pop         rbp
%else
  %undef xmm5
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
    RESTORE_XMM
  %endif
%endif

    ret

SECTION_RODATA
align 16
; vp8/common/entropy.c: vp8_default_zig_zag1d
zig_zag1d:
    db  0,  1,  4,  8,  5,  2,  3,  6,  9, 12, 13, 10,  7, 11, 14, 15
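The block above that selects "the last value (in zig_zag order) for EOB" compresses a scan into a handful of vector ops: compare each qcoeff lane against zero, pack to bytes, permute the bytes into zig-zag order with pshufb, then take bsr of the inverted zero mask (the sub/sar/and sequence zeroes the result when every coefficient is zero, since bsr is undefined on zero input). A hedged scalar C sketch of the same semantics; the helper name and its raster-order qcoeff argument are illustrative assumptions, not code from this commit:

    /* Scalar sketch of the assembly's EOB selection: eob is one past the
     * last nonzero quantized coefficient when walking in zig-zag order,
     * or 0 if every coefficient quantized to zero. */
    static const int zig_zag1d[16] = { 0, 1,  4,  8, 5,  2,  3,  6,
                                       9, 12, 13, 10, 7, 11, 14, 15 };

    static char compute_eob(const short qcoeff[16]) { /* raster order */
        char eob = 0;
        int i;
        for (i = 0; i < 16; i++) {
            if (qcoeff[zig_zag1d[i]] != 0)
                eob = (char)(i + 1);
        }
        return eob;
    }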
vp8/encoder/x86/quantize_sse4.c (new file, 0 → 100644):

/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <smmintrin.h> /* SSE4.1 */

#include "./vp8_rtcd.h"
#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

#define SELECT_EOB(i, z, x, y, q) \
    do { \
        __label__ select_eob_end; \
        short boost = *zbin_boost_ptr; \
        short x_z = _mm_extract_epi16(x, z); \
        short y_z = _mm_extract_epi16(y, z); \
        int cmp = (x_z < boost) | (y_z == 0); \
        zbin_boost_ptr++; \
        if (cmp) \
            goto select_eob_end; \
        q = _mm_insert_epi16(q, y_z, z); \
        eob = i; \
        zbin_boost_ptr = b->zrun_zbin_boost; \
        select_eob_end:; \
    } while (0)

void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
    char eob = 0;
    short *zbin_boost_ptr = b->zrun_zbin_boost;

    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1,
            dqcoeff0, dqcoeff1;
    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
    __m128i qcoeff0 = _mm_setzero_si128();
    __m128i qcoeff1 = _mm_setzero_si128();

    /* Duplicate to all lanes. */
    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

    /* Sign of z: z >> 15 */
    sz0 = _mm_srai_epi16(z0, 15);
    sz1 = _mm_srai_epi16(z1, 15);

    /* x = abs(z): (z ^ sz) - sz */
    x0 = _mm_xor_si128(z0, sz0);
    x1 = _mm_xor_si128(z1, sz1);
    x0 = _mm_sub_epi16(x0, sz0);
    x1 = _mm_sub_epi16(x1, sz1);

    /* zbin[] + zbin_extra */
    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
    zbin1 = _mm_add_epi16(zbin1, zbin_extra);

    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
     * the equation because boost is the only value which can change:
     * x - (zbin[] + extra) >= boost */
    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);

    /* All the remaining calculations are valid whether they are done now with
     * simd or later inside the loop one at a time. */
    x0 = _mm_add_epi16(x0, round0);
    x1 = _mm_add_epi16(x1, round1);

    y0 = _mm_mulhi_epi16(x0, quant0);
    y1 = _mm_mulhi_epi16(x1, quant1);

    y0 = _mm_add_epi16(y0, x0);
    y1 = _mm_add_epi16(y1, x1);

    /* Instead of shifting each value independently we convert the scaling
     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
    y0 = _mm_mulhi_epi16(y0, quant_shift0);
    y1 = _mm_mulhi_epi16(y1, quant_shift1);

    /* Return the sign: (y ^ sz) - sz */
    y0 = _mm_xor_si128(y0, sz0);
    y1 = _mm_xor_si128(y1, sz1);
    y0 = _mm_sub_epi16(y0, sz0);
    y1 = _mm_sub_epi16(y1, sz1);

    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
    SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
    SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
    SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);

    _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
    _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);

    dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0);
    dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1);

    _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
    _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);

    *d->eob = eob;
}
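The "multiply/return high half" comment in the file above leans on the identity (y * (1 << (16 - s))) >> 16 == y >> s for non-negative y, which is what lets a single _mm_mulhi_epi16 against the converted quant_shift table replace sixteen per-lane variable shifts. A hedged standalone check of that identity, not code from this commit; it assumes s >= 2 so that the stored scale 1 << (16 - s) stays a positive signed 16-bit value:

    /* Standalone sketch verifying the shift-as-multiply identity behind
     * the quant_shift conversion. */
    #include <assert.h>

    static short shift_via_mulhi(short y, int s) {
        short scale = (short)(1 << (16 - s));   /* how quant_shift is stored */
        return (short)(((int)y * scale) >> 16); /* per-lane _mm_mulhi_epi16 */
    }

    int main(void) {
        int y, s;
        for (y = 0; y <= 32767; y += 7)
            for (s = 2; s <= 15; s++)
                assert(shift_via_mulhi((short)y, s) == (short)(y >> s));
        return 0;
    }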
vp8/vp8cx.mk:

@@ -89,6 +89,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
@@ -97,7 +98,6 @@ endif
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
 VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt_x86_64.asm