Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
00a9671b
Commit
00a9671b
authored
Oct 14, 2014
by
Alex Converse
Committed by
Gerrit Code Review
Oct 14, 2014
Browse files
Merge "Add a 32-bit friendly sse2 quantizer."
parents
a78fd6a4
7497d2fb
Changes
3
Hide whitespace changes
Inline
Side-by-side
vp9/common/vp9_rtcd_defs.pl
View file @
00a9671b
...
...
@@ -1155,7 +1155,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize
qw/vp9_quantize_fp_32x32/
,
"
$ssse3_x86_64
";
add_proto
qw/void vp9_quantize_b/
,
"
const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan
";
specialize
qw/vp9_quantize_b/
,
"
$ssse3_x86_64
";
specialize
qw/vp9_quantize_b
sse2
/
,
"
$ssse3_x86_64
";
add_proto
qw/void vp9_quantize_b_32x32/
,
"
const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan
";
specialize
qw/vp9_quantize_b_32x32/
,
"
$ssse3_x86_64
";
...
...
vp9/encoder/x86/vp9_quantize_sse2.c
0 → 100644
View file @
00a9671b
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include
<emmintrin.h>
#include
<xmmintrin.h>
#include
"vpx/vpx_integer.h"
/*
 * SSE2 implementation of the VP9 "b" quantizer.
 *
 * Processes 16 coefficients per iteration (two 8-lane registers).  The
 * first iteration applies the DC entries of the zbin/round/quant/shift/
 * dequant tables to lane 0 and then shifts each table to its AC entries
 * with _mm_unpackhi_epi64; every later iteration is AC only.
 *
 * Outputs: qcoeff_ptr and dqcoeff_ptr are fully written (zeroed when
 * skip_block is nonzero) and *eob_ptr receives 1 + the iscan position of
 * the last nonzero dequantized coefficient, or 0 if all are zero.
 * n_coeffs is assumed to be a positive multiple of 16 and all array
 * arguments 16-byte aligned (aligned loads/stores are used throughout).
 */
void vp9_quantize_b_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
                         int skip_block, const int16_t *zbin_ptr,
                         const int16_t *round_ptr, const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
                         int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         int zbin_oq_value, uint16_t *eob_ptr,
                         const int16_t *scan_ptr, const int16_t *iscan_ptr) {
  __m128i zero;
  (void)scan_ptr;

  /* Index from the end of each buffer with a negative offset so the loop
   * condition is simply "offset < 0". */
  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

  if (!skip_block) {
    __m128i eob;
    __m128i zbin;
    __m128i round, quant, dequant, shift;
    {
      __m128i coeff0, coeff1;

      /* Load the quantization tables once.  zbin_oq is folded into zbin,
       * and zbin is biased by -1 so that a signed > comparison implements
       * "abs(coeff) >= zbin + zbin_oq". */
      {
        __m128i oq;
        __m128i one;
        oq = _mm_set1_epi16(zbin_oq_value);
        zbin = _mm_load_si128((const __m128i *)zbin_ptr);
        round = _mm_load_si128((const __m128i *)round_ptr);
        quant = _mm_load_si128((const __m128i *)quant_ptr);
        zbin = _mm_add_epi16(zbin, oq);
        one = _mm_set1_epi16(1);
        zbin = _mm_sub_epi16(zbin, one);
        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
        shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
      }

      /* First iteration: DC plus the first 15 AC coefficients. */
      {
        __m128i sign0, sign1;
        __m128i qcoeff0, qcoeff1;
        __m128i tmp0, tmp1;
        __m128i mask0, mask1;

        coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
        coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);

        /* Cheap sign extraction: sign is all-ones for negative lanes, so
         * (x ^ sign) - sign == abs(x). */
        sign0 = _mm_srai_epi16(coeff0, 15);
        sign1 = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, sign0);
        qcoeff1 = _mm_xor_si128(coeff1, sign1);
        qcoeff0 = _mm_sub_epi16(qcoeff0, sign0);
        qcoeff1 = _mm_sub_epi16(qcoeff1, sign1);

        /* Dead-zone test against zbin; switch the table from DC to AC
         * after lane 0 has consumed the DC entry. */
        mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
        zbin = _mm_unpackhi_epi64(zbin, zbin);
        mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

        /* q = (((abs + round) * quant >> 16) + abs + round) * shift >> 16 */
        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        tmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        tmp1 = _mm_mulhi_epi16(qcoeff1, quant);
        tmp0 = _mm_add_epi16(tmp0, qcoeff0);
        tmp1 = _mm_add_epi16(tmp1, qcoeff1);
        qcoeff0 = _mm_mulhi_epi16(tmp0, shift);
        shift = _mm_unpackhi_epi64(shift, shift);
        qcoeff1 = _mm_mulhi_epi16(tmp1, shift);

        /* Restore the original signs. */
        qcoeff0 = _mm_xor_si128(qcoeff0, sign0);
        qcoeff1 = _mm_xor_si128(qcoeff1, sign1);
        qcoeff0 = _mm_sub_epi16(qcoeff0, sign0);
        qcoeff1 = _mm_sub_epi16(qcoeff1, sign1);

        /* Zero out everything inside the dead zone. */
        qcoeff0 = _mm_and_si128(qcoeff0, mask0);
        qcoeff1 = _mm_and_si128(qcoeff1, mask1);

        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

        /* Dequantize; coeff0/coeff1 now carry the dqcoeff values used by
         * the EOB scan below. */
        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
      }

      /* Seed the EOB accumulator from this iteration. */
      {
        __m128i iszero0, iszero1;
        __m128i nonzero0, nonzero1;
        __m128i iscan0, iscan1;
        __m128i eob1;

        iszero0 = _mm_cmpeq_epi16(coeff0, zero);
        iszero1 = _mm_cmpeq_epi16(coeff1, zero);
        nonzero0 = _mm_cmpeq_epi16(iszero0, zero);
        nonzero1 = _mm_cmpeq_epi16(iszero1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        /* Subtracting the all-ones nonzero mask adds 1, converting scan
         * indices into counts. */
        iscan0 = _mm_sub_epi16(iscan0, nonzero0);
        iscan1 = _mm_sub_epi16(iscan1, nonzero1);
        eob = _mm_and_si128(iscan0, nonzero0);
        eob1 = _mm_and_si128(iscan1, nonzero1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    /* Remaining iterations: AC coefficients only (tables already hold
     * their AC entries, so no unpackhi switching). */
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i sign0, sign1;
        __m128i qcoeff0, qcoeff1;
        __m128i tmp0, tmp1;
        __m128i mask0, mask1;

        coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs));
        coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1);

        /* abs() via sign mask, as above. */
        sign0 = _mm_srai_epi16(coeff0, 15);
        sign1 = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, sign0);
        qcoeff1 = _mm_xor_si128(coeff1, sign1);
        qcoeff0 = _mm_sub_epi16(qcoeff0, sign0);
        qcoeff1 = _mm_sub_epi16(qcoeff1, sign1);

        mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
        mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        tmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        tmp1 = _mm_mulhi_epi16(qcoeff1, quant);
        tmp0 = _mm_add_epi16(tmp0, qcoeff0);
        tmp1 = _mm_add_epi16(tmp1, qcoeff1);
        qcoeff0 = _mm_mulhi_epi16(tmp0, shift);
        qcoeff1 = _mm_mulhi_epi16(tmp1, shift);

        qcoeff0 = _mm_xor_si128(qcoeff0, sign0);
        qcoeff1 = _mm_xor_si128(qcoeff1, sign1);
        qcoeff0 = _mm_sub_epi16(qcoeff0, sign0);
        qcoeff1 = _mm_sub_epi16(qcoeff1, sign1);

        qcoeff0 = _mm_and_si128(qcoeff0, mask0);
        qcoeff1 = _mm_and_si128(qcoeff1, mask1);

        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
      }

      /* Merge this iteration's EOB candidates into the accumulator. */
      {
        __m128i iszero0, iszero1;
        __m128i nonzero0, nonzero1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;

        iszero0 = _mm_cmpeq_epi16(coeff0, zero);
        iszero1 = _mm_cmpeq_epi16(coeff1, zero);
        nonzero0 = _mm_cmpeq_epi16(iszero0, zero);
        nonzero1 = _mm_cmpeq_epi16(iszero1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        /* index -> count, as in the first iteration */
        iscan0 = _mm_sub_epi16(iscan0, nonzero0);
        iscan1 = _mm_sub_epi16(iscan1, nonzero1);
        eob0 = _mm_and_si128(iscan0, nonzero0);
        eob1 = _mm_and_si128(iscan1, nonzero1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
    }

    /* Horizontal max over the 8 lanes of the EOB accumulator. */
    {
      __m128i folded;
      folded = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, folded);
      folded = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, folded);
      folded = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, folded);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    /* Skipped block: just zero both output buffers and report EOB 0. */
    do {
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}
vp9/vp9cx.mk
View file @
00a9671b
...
...
@@ -101,6 +101,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2)
+=
encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3)
+=
encoder/x86/vp9_sad_sse3.asm
VP9_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/vp9_quantize_sse2.c
ifeq
($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX)
+=
encoder/x86/vp9_dct_mmx.asm
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment