Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
38408900
Commit
38408900
authored
Nov 22, 2013
by
Yunqing Wang
Committed by
Gerrit Code Review
Nov 22, 2013
Browse files
Merge "Improve vp9_fdct4x4_sse2 (x1.2)"
parents
16ad35f6
ec2dbdd1
Changes
1
Hide whitespace changes
Inline
Side-by-side
vp9/encoder/x86/vp9_dct_sse2.c
View file @
38408900
...
...
@@ -26,24 +26,25 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// by constructing the 32 bit constant corresponding to that pair.
const
__m128i
k__cospi_p16_p16
=
_mm_set1_epi16
(
cospi_16_64
);
const
__m128i
k__cospi_p16_m16
=
pair_set_epi16
(
cospi_16_64
,
-
cospi_16_64
);
const
__m128i
k__cospi_p
24_p08
=
pair_set_epi16
(
cospi_
24
_64
,
cospi_
8
_64
);
const
__m128i
k__cospi_
m08_p24
=
pair_set_epi16
(
-
cospi_
8
_64
,
cospi_
24
_64
);
const
__m128i
k__cospi_p
08_p24
=
pair_set_epi16
(
cospi_
8
_64
,
cospi_
24
_64
);
const
__m128i
k__cospi_
p24_m08
=
pair_set_epi16
(
cospi_
24
_64
,
-
cospi_
8
_64
);
const
__m128i
k__DCT_CONST_ROUNDING
=
_mm_set1_epi32
(
DCT_CONST_ROUNDING
);
const
__m128i
k__nonzero_bias_a
=
_mm_setr_epi16
(
0
,
1
,
1
,
1
,
1
,
1
,
1
,
1
);
const
__m128i
k__nonzero_bias_b
=
_mm_setr_epi16
(
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
);
const
__m128i
kOne
=
_mm_set1_epi16
(
1
);
__m128i
in0
,
in1
,
in2
,
in3
;
__m128i
in0
,
in1
;
// Load inputs.
{
in0
=
_mm_loadl_epi64
((
const
__m128i
*
)(
input
+
0
*
stride
));
in1
=
_mm_loadl_epi64
((
const
__m128i
*
)(
input
+
1
*
stride
));
in2
=
_mm_loadl_epi64
((
const
__m128i
*
)(
input
+
2
*
stride
));
in3
=
_mm_loadl_epi64
((
const
__m128i
*
)(
input
+
3
*
stride
));
in0
=
_mm_unpacklo_epi64
(
in0
,
_mm_loadl_epi64
((
const
__m128i
*
)
(
input
+
1
*
stride
)));
in1
=
_mm_loadl_epi64
((
const
__m128i
*
)(
input
+
2
*
stride
));
in1
=
_mm_unpacklo_epi64
(
_mm_loadl_epi64
((
const
__m128i
*
)
(
input
+
3
*
stride
)),
in1
);
// x = x << 4
in0
=
_mm_slli_epi16
(
in0
,
4
);
in1
=
_mm_slli_epi16
(
in1
,
4
);
in2
=
_mm_slli_epi16
(
in2
,
4
);
in3
=
_mm_slli_epi16
(
in3
,
4
);
// if (i == 0 && input[0]) input[0] += 1;
{
// The mask will only contain whether the first value is zero, all
...
...
@@ -60,18 +61,18 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// Do the two transform/transpose passes
for
(
pass
=
0
;
pass
<
2
;
++
pass
)
{
// Transform 1/2: Add/subtract
const
__m128i
r0
=
_mm_add_epi16
(
in0
,
in
3
);
const
__m128i
r1
=
_mm_
add
_epi16
(
in
1
,
in
2
);
const
__m128i
r2
=
_mm_
sub_epi16
(
in1
,
in2
);
const
__m128i
r3
=
_mm_
sub_epi16
(
in0
,
in3
);
const
__m128i
r0
=
_mm_add_epi16
(
in0
,
in
1
);
const
__m128i
r1
=
_mm_
sub
_epi16
(
in
0
,
in
1
);
const
__m128i
r2
=
_mm_
unpacklo_epi64
(
r0
,
r1
);
const
__m128i
r3
=
_mm_
unpackhi_epi64
(
r0
,
r1
);
// Transform 1/2: Interleave to do the multiply by constants which gets us
// into 32 bits.
const
__m128i
t0
=
_mm_unpacklo_epi16
(
r
0
,
r
1
);
const
__m128i
t2
=
_mm_unpack
lo
_epi16
(
r2
,
r3
);
const
__m128i
t0
=
_mm_unpacklo_epi16
(
r
2
,
r
3
);
const
__m128i
t2
=
_mm_unpack
hi
_epi16
(
r2
,
r3
);
const
__m128i
u0
=
_mm_madd_epi16
(
t0
,
k__cospi_p16_p16
);
const
__m128i
u2
=
_mm_madd_epi16
(
t0
,
k__cospi_p16_m16
);
const
__m128i
u4
=
_mm_madd_epi16
(
t2
,
k__cospi_p
24_p08
);
const
__m128i
u6
=
_mm_madd_epi16
(
t2
,
k__cospi_
m08_p24
);
const
__m128i
u4
=
_mm_madd_epi16
(
t2
,
k__cospi_p
08_p24
);
const
__m128i
u6
=
_mm_madd_epi16
(
t2
,
k__cospi_
p24_m08
);
const
__m128i
v0
=
_mm_add_epi32
(
u0
,
k__DCT_CONST_ROUNDING
);
const
__m128i
v2
=
_mm_add_epi32
(
u2
,
k__DCT_CONST_ROUNDING
);
const
__m128i
v4
=
_mm_add_epi32
(
u4
,
k__DCT_CONST_ROUNDING
);
...
...
@@ -90,24 +91,21 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// 00 10 01 11 02 12 03 13
// 20 30 21 31 22 32 23 33
in0
=
_mm_unpacklo_epi32
(
tr0_0
,
tr0_1
);
in2
=
_mm_unpackhi_epi32
(
tr0_0
,
tr0_1
);
in1
=
_mm_unpackhi_epi32
(
tr0_0
,
tr0_1
);
in1
=
_mm_shuffle_epi32
(
in1
,
0x4E
);
// 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1
// 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3
if
(
0
==
pass
)
{
// Extract values in the high part for second pass as transform code
// only uses the first four values.
in1
=
_mm_unpackhi_epi64
(
in0
,
in0
);
in3
=
_mm_unpackhi_epi64
(
in2
,
in2
);
}
else
{
// Post-condition output and store it (v + 1) >> 2, taking advantage
// of the fact 1/3 are stored just after 0/2.
__m128i
out01
=
_mm_add_epi16
(
in0
,
kOne
);
__m128i
out23
=
_mm_add_epi16
(
in2
,
kOne
);
out01
=
_mm_srai_epi16
(
out01
,
2
);
out23
=
_mm_srai_epi16
(
out23
,
2
);
_mm_storeu_si128
((
__m128i
*
)(
output
+
0
*
4
),
out01
);
_mm_storeu_si128
((
__m128i
*
)(
output
+
2
*
4
),
out23
);
}
// 02 12 22 32 03 13 23 33 in1 contains 2 followed by 3
}
in1
=
_mm_shuffle_epi32
(
in1
,
0x4E
);
// Post-condition output and store it (v + 1) >> 2, taking advantage
// of the fact 1/3 are stored just after 0/2.
{
__m128i
out01
=
_mm_add_epi16
(
in0
,
kOne
);
__m128i
out23
=
_mm_add_epi16
(
in1
,
kOne
);
out01
=
_mm_srai_epi16
(
out01
,
2
);
out23
=
_mm_srai_epi16
(
out23
,
2
);
_mm_storeu_si128
((
__m128i
*
)(
output
+
0
*
4
),
out01
);
_mm_storeu_si128
((
__m128i
*
)(
output
+
2
*
4
),
out23
);
}
}
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment