Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
faa2ba86
Commit
faa2ba86
authored
Jan 09, 2014
by
Jingning Han
Committed by
Gerrit Code Review
Jan 09, 2014
Browse files
Merge "Optimze inv 16x16 DCT with 10 non-zero coeffs - P1"
parents
36c8daed
ba6ab46c
Changes
1
Hide whitespace changes
Inline
Side-by-side
vp9/common/x86/vp9_idct_intrin_sse2.c
View file @
faa2ba86
...
...
@@ -380,17 +380,13 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
}
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1
, out2, out3
) \
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
\
in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
/* i1 i0 */
\
in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
/* i3 i2 */
\
in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
/* i5 i4 */
\
in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
/* i7 i6 */
\
}
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
...
...
@@ -2463,112 +2459,66 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
stp2_8
,
stp2_9
,
stp2_10
,
stp2_11
,
stp2_12
,
stp2_13
,
stp2_14
,
stp2_15
;
__m128i
tmp0
,
tmp1
,
tmp2
,
tmp3
,
tmp4
,
tmp5
,
tmp6
,
tmp7
;
int
i
;
in
[
4
]
=
in
[
5
]
=
in
[
6
]
=
in
[
7
]
=
in
[
12
]
=
in
[
13
]
=
in
[
14
]
=
in
[
15
]
=
zero
;
// 1-D idct. Load input data.
// in[4] = in[5] = in[6] = in[7] = in[12] = in[13] = in[14] = in[15] = zero;
// First 1-D inverse DCT
// Load input data.
in
[
0
]
=
_mm_load_si128
((
const
__m128i
*
)
input
);
in
[
8
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
8
*
1
));
in
[
1
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
8
*
2
));
in
[
9
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
8
*
3
));
in
[
2
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
8
*
4
));
in
[
10
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
8
*
5
));
in
[
3
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
8
*
6
));
in
[
11
]
=
_mm_load_si128
((
const
__m128i
*
)(
input
+
8
*
7
));
TRANSPOSE_8X4
(
in
[
0
],
in
[
1
],
in
[
2
],
in
[
3
],
in
[
0
],
in
[
1
],
in
[
2
],
in
[
3
]);
TRANSPOSE_8X4
(
in
[
8
],
in
[
9
],
in
[
10
],
in
[
11
],
in
[
8
],
in
[
9
],
in
[
10
],
in
[
11
]);
TRANSPOSE_8X4
(
in
[
0
],
in
[
1
],
in
[
2
],
in
[
3
],
in
[
0
],
in
[
1
]);
// Stage2
{
const
__m128i
lo_1_15
=
_mm_unpackhi_epi16
(
in
[
0
],
in
[
11
]);
const
__m128i
lo_9_7
=
_mm_unpackhi_epi16
(
in
[
8
],
in
[
3
]);
const
__m128i
lo_5_11
=
_mm_unpackhi_epi16
(
in
[
2
],
in
[
9
]);
const
__m128i
lo_13_3
=
_mm_unpackhi_epi16
(
in
[
10
],
in
[
1
]);
const
__m128i
lo_1_15
=
_mm_unpackhi_epi16
(
in
[
0
],
zero
);
const
__m128i
lo_13_3
=
_mm_unpackhi_epi16
(
zero
,
in
[
1
]);
tmp0
=
_mm_madd_epi16
(
lo_1_15
,
stg2_0
);
tmp2
=
_mm_madd_epi16
(
lo_1_15
,
stg2_1
);
tmp4
=
_mm_madd_epi16
(
lo_9_7
,
stg2_2
);
tmp6
=
_mm_madd_epi16
(
lo_9_7
,
stg2_3
);
tmp1
=
_mm_madd_epi16
(
lo_5_11
,
stg2_4
);
tmp3
=
_mm_madd_epi16
(
lo_5_11
,
stg2_5
);
tmp5
=
_mm_madd_epi16
(
lo_13_3
,
stg2_6
);
tmp7
=
_mm_madd_epi16
(
lo_13_3
,
stg2_7
);
tmp0
=
_mm_add_epi32
(
tmp0
,
rounding
);
tmp2
=
_mm_add_epi32
(
tmp2
,
rounding
);
tmp4
=
_mm_add_epi32
(
tmp4
,
rounding
);
tmp6
=
_mm_add_epi32
(
tmp6
,
rounding
);
tmp1
=
_mm_add_epi32
(
tmp1
,
rounding
);
tmp3
=
_mm_add_epi32
(
tmp3
,
rounding
);
tmp5
=
_mm_add_epi32
(
tmp5
,
rounding
);
tmp7
=
_mm_add_epi32
(
tmp7
,
rounding
);
tmp0
=
_mm_srai_epi32
(
tmp0
,
DCT_CONST_BITS
);
tmp2
=
_mm_srai_epi32
(
tmp2
,
DCT_CONST_BITS
);
tmp4
=
_mm_srai_epi32
(
tmp4
,
DCT_CONST_BITS
);
tmp6
=
_mm_srai_epi32
(
tmp6
,
DCT_CONST_BITS
);
tmp1
=
_mm_srai_epi32
(
tmp1
,
DCT_CONST_BITS
);
tmp3
=
_mm_srai_epi32
(
tmp3
,
DCT_CONST_BITS
);
tmp5
=
_mm_srai_epi32
(
tmp5
,
DCT_CONST_BITS
);
tmp7
=
_mm_srai_epi32
(
tmp7
,
DCT_CONST_BITS
);
stp2_8
=
_mm_packs_epi32
(
tmp0
,
zero
);
stp2_15
=
_mm_packs_epi32
(
tmp2
,
zero
);
stp2_9
=
_mm_packs_epi32
(
tmp4
,
zero
);
stp2_14
=
_mm_packs_epi32
(
tmp6
,
zero
);
stp2_10
=
_mm_packs_epi32
(
tmp1
,
zero
);
stp2_13
=
_mm_packs_epi32
(
tmp3
,
zero
);
stp2_11
=
_mm_packs_epi32
(
tmp5
,
zero
);
stp2_12
=
_mm_packs_epi32
(
tmp7
,
zero
);
stp2_8
=
_mm_packs_epi32
(
tmp0
,
tmp2
);
stp2_11
=
_mm_packs_epi32
(
tmp5
,
tmp7
);
}
// Stage3
{
const
__m128i
lo_2_14
=
_mm_unpacklo_epi16
(
in
[
1
],
in
[
11
]);
const
__m128i
lo_10_6
=
_mm_unpacklo_epi16
(
in
[
9
],
in
[
3
]);
const
__m128i
lo_2_14
=
_mm_unpacklo_epi16
(
in
[
1
],
zero
);
tmp0
=
_mm_madd_epi16
(
lo_2_14
,
stg3_0
);
tmp2
=
_mm_madd_epi16
(
lo_2_14
,
stg3_1
);
tmp4
=
_mm_madd_epi16
(
lo_10_6
,
stg3_2
);
tmp6
=
_mm_madd_epi16
(
lo_10_6
,
stg3_3
);
tmp0
=
_mm_add_epi32
(
tmp0
,
rounding
);
tmp2
=
_mm_add_epi32
(
tmp2
,
rounding
);
tmp4
=
_mm_add_epi32
(
tmp4
,
rounding
);
tmp6
=
_mm_add_epi32
(
tmp6
,
rounding
);
tmp0
=
_mm_srai_epi32
(
tmp0
,
DCT_CONST_BITS
);
tmp2
=
_mm_srai_epi32
(
tmp2
,
DCT_CONST_BITS
);
tmp4
=
_mm_srai_epi32
(
tmp4
,
DCT_CONST_BITS
);
tmp6
=
_mm_srai_epi32
(
tmp6
,
DCT_CONST_BITS
);
stp1_4
=
_mm_packs_epi32
(
tmp0
,
zero
);
stp1_7
=
_mm_packs_epi32
(
tmp2
,
zero
);
stp1_5
=
_mm_packs_epi32
(
tmp4
,
zero
);
stp1_6
=
_mm_packs_epi32
(
tmp6
,
zero
);
stp1_13
=
_mm_unpackhi_epi64
(
stp2_11
,
zero
);
stp1_14
=
_mm_unpackhi_epi64
(
stp2_8
,
zero
);
stp1_8_0
=
_mm_add_epi16
(
stp2_8
,
stp2_9
);
stp1_9
=
_mm_sub_epi16
(
stp2_8
,
stp2_9
);
stp1_10
=
_mm_sub_epi16
(
stp2_11
,
stp2_10
);
stp1_11
=
_mm_add_epi16
(
stp2_11
,
stp2_10
);
stp1_12_0
=
_mm_add_epi16
(
stp2_12
,
stp2_13
);
stp1_13
=
_mm_sub_epi16
(
stp2_12
,
stp2_13
);
stp1_14
=
_mm_sub_epi16
(
stp2_15
,
stp2_14
);
stp1_15
=
_mm_add_epi16
(
stp2_15
,
stp2_14
);
stp1_4
=
_mm_packs_epi32
(
tmp0
,
tmp2
);
}
// Stage4
{
const
__m128i
lo_0_8
=
_mm_unpacklo_epi16
(
in
[
0
],
in
[
8
]);
const
__m128i
lo_4_12
=
_mm_unpacklo_epi16
(
in
[
2
],
in
[
10
]);
const
__m128i
lo_9_14
=
_mm_unpacklo_epi16
(
stp1_9
,
stp1_14
);
const
__m128i
lo_10_13
=
_mm_unpacklo_epi16
(
stp1_10
,
stp1_13
);
const
__m128i
lo_0_8
=
_mm_unpacklo_epi16
(
in
[
0
],
zero
);
const
__m128i
lo_9_14
=
_mm_unpacklo_epi16
(
stp2_8
,
stp1_14
);
const
__m128i
lo_10_13
=
_mm_unpacklo_epi16
(
stp2_11
,
stp1_13
);
tmp0
=
_mm_madd_epi16
(
lo_0_8
,
stg4_0
);
tmp2
=
_mm_madd_epi16
(
lo_0_8
,
stg4_1
);
tmp4
=
_mm_madd_epi16
(
lo_4_12
,
stg4_2
);
tmp6
=
_mm_madd_epi16
(
lo_4_12
,
stg4_3
);
tmp1
=
_mm_madd_epi16
(
lo_9_14
,
stg4_4
);
tmp3
=
_mm_madd_epi16
(
lo_9_14
,
stg4_5
);
tmp5
=
_mm_madd_epi16
(
lo_10_13
,
stg4_6
);
...
...
@@ -2576,8 +2526,6 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
tmp0
=
_mm_add_epi32
(
tmp0
,
rounding
);
tmp2
=
_mm_add_epi32
(
tmp2
,
rounding
);
tmp4
=
_mm_add_epi32
(
tmp4
,
rounding
);
tmp6
=
_mm_add_epi32
(
tmp6
,
rounding
);
tmp1
=
_mm_add_epi32
(
tmp1
,
rounding
);
tmp3
=
_mm_add_epi32
(
tmp3
,
rounding
);
tmp5
=
_mm_add_epi32
(
tmp5
,
rounding
);
...
...
@@ -2585,49 +2533,40 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
tmp0
=
_mm_srai_epi32
(
tmp0
,
DCT_CONST_BITS
);
tmp2
=
_mm_srai_epi32
(
tmp2
,
DCT_CONST_BITS
);
tmp4
=
_mm_srai_epi32
(
tmp4
,
DCT_CONST_BITS
);
tmp6
=
_mm_srai_epi32
(
tmp6
,
DCT_CONST_BITS
);
tmp1
=
_mm_srai_epi32
(
tmp1
,
DCT_CONST_BITS
);
tmp3
=
_mm_srai_epi32
(
tmp3
,
DCT_CONST_BITS
);
tmp5
=
_mm_srai_epi32
(
tmp5
,
DCT_CONST_BITS
);
tmp7
=
_mm_srai_epi32
(
tmp7
,
DCT_CONST_BITS
);
stp2_0
=
_mm_packs_epi32
(
tmp0
,
zero
);
stp2_1
=
_mm_packs_epi32
(
tmp2
,
zero
);
stp2_2
=
_mm_packs_epi32
(
tmp4
,
zero
);
stp2_3
=
_mm_packs_epi32
(
tmp6
,
zero
);
stp2_9
=
_mm_packs_epi32
(
tmp1
,
zero
);
stp2_14
=
_mm_packs_epi32
(
tmp3
,
zero
);
stp2_10
=
_mm_packs_epi32
(
tmp5
,
zero
);
stp2_13
=
_mm_packs_epi32
(
tmp7
,
zero
);
stp2_4
=
_mm_add_epi16
(
stp1_4
,
stp1_5
);
stp2_5
=
_mm_sub_epi16
(
stp1_4
,
stp1_5
);
stp2_6
=
_mm_sub_epi16
(
stp1_7
,
stp1_6
);
stp2_7
=
_mm_add_epi16
(
stp1_7
,
stp1_6
);
stp1_0
=
_mm_packs_epi32
(
tmp0
,
tmp0
);
stp1_1
=
_mm_packs_epi32
(
tmp2
,
tmp2
);
stp2_9
=
_mm_packs_epi32
(
tmp1
,
tmp3
);
stp2_10
=
_mm_packs_epi32
(
tmp5
,
tmp7
);
stp2_6
=
_mm_unpackhi_epi64
(
stp1_4
,
zero
);
}
// Stage5 and Stage6
{
stp1_
0
=
_mm_add_epi16
(
stp2_
0
,
stp2_
3
);
stp1_
1
=
_mm_
add
_epi16
(
stp2_
1
,
stp2_
2
);
stp1_
2
=
_mm_
sub
_epi16
(
stp2_
1
,
stp2_
2
);
stp1_
3
=
_mm_sub_epi16
(
stp2_
0
,
stp2_
3
);
stp1_
8
=
_mm_
add_epi16
(
stp1_8_0
,
stp1_11
);
stp1_
9
=
_mm_
add_epi16
(
stp2_9
,
stp2_10
);
stp1_
10
=
_mm_
sub_epi16
(
stp2_9
,
stp2_10
);
stp1_11
=
_mm_
sub_epi16
(
stp1_8_0
,
stp1_11
);
stp1_1
2
=
_mm_
sub_epi16
(
stp1_15
,
stp1_12_0
);
stp1_1
3
=
_mm_
sub_epi16
(
stp2_14
,
stp2_13
);
stp1_1
4
=
_mm_
add_epi16
(
stp2_14
,
stp2_13
);
stp1_15
=
_mm_
add_epi16
(
stp1_15
,
stp1_12_0
);
tmp
0
=
_mm_add_epi16
(
stp2_
8
,
stp2_
11
);
tmp
1
=
_mm_
sub
_epi16
(
stp2_
8
,
stp2_
11
);
tmp
2
=
_mm_
add
_epi16
(
stp2_
9
,
stp2_
10
);
tmp
3
=
_mm_sub_epi16
(
stp2_
9
,
stp2_
10
);
stp1_
9
=
_mm_
unpacklo_epi64
(
tmp2
,
zero
);
stp1_
10
=
_mm_
unpacklo_epi64
(
tmp3
,
zero
);
stp1_
8
=
_mm_
unpacklo_epi64
(
tmp0
,
zero
);
stp1_11
=
_mm_
unpacklo_epi64
(
tmp1
,
zero
);
stp1_1
3
=
_mm_
unpackhi_epi64
(
tmp3
,
zero
);
stp1_1
4
=
_mm_
unpackhi_epi64
(
tmp2
,
zero
);
stp1_1
2
=
_mm_
unpackhi_epi64
(
tmp1
,
zero
);
stp1_15
=
_mm_
unpackhi_epi64
(
tmp0
,
zero
);
}
// Stage6
{
const
__m128i
lo_6_5
=
_mm_unpacklo_epi16
(
stp2_6
,
stp
2_5
);
const
__m128i
lo_6_5
=
_mm_unpacklo_epi16
(
stp2_6
,
stp
1_4
);
const
__m128i
lo_10_13
=
_mm_unpacklo_epi16
(
stp1_10
,
stp1_13
);
const
__m128i
lo_11_12
=
_mm_unpacklo_epi16
(
stp1_11
,
stp1_12
);
...
...
@@ -2652,21 +2591,26 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
tmp4
=
_mm_srai_epi32
(
tmp4
,
DCT_CONST_BITS
);
tmp6
=
_mm_srai_epi32
(
tmp6
,
DCT_CONST_BITS
);
stp1_
5
=
_mm_packs_epi32
(
tmp
1
,
zero
);
stp1_6
=
_mm_packs_epi32
(
tmp3
,
zero
);
stp1_
6
=
_mm_packs_epi32
(
tmp
3
,
tmp1
);
stp2_10
=
_mm_packs_epi32
(
tmp0
,
zero
);
stp2_13
=
_mm_packs_epi32
(
tmp2
,
zero
);
stp2_11
=
_mm_packs_epi32
(
tmp4
,
zero
);
stp2_12
=
_mm_packs_epi32
(
tmp6
,
zero
);
stp2_0
=
_mm_add_epi16
(
stp1_0
,
stp2_7
);
stp2_1
=
_mm_add_epi16
(
stp1_1
,
stp1_6
);
stp2_2
=
_mm_add_epi16
(
stp1_2
,
stp1_5
);
stp2_3
=
_mm_add_epi16
(
stp1_3
,
stp2_4
);
stp2_4
=
_mm_sub_epi16
(
stp1_3
,
stp2_4
);
stp2_5
=
_mm_sub_epi16
(
stp1_2
,
stp1_5
);
stp2_6
=
_mm_sub_epi16
(
stp1_1
,
stp1_6
);
stp2_7
=
_mm_sub_epi16
(
stp1_0
,
stp2_7
);
tmp0
=
_mm_add_epi16
(
stp1_0
,
stp1_4
);
tmp1
=
_mm_sub_epi16
(
stp1_0
,
stp1_4
);
tmp2
=
_mm_add_epi16
(
stp1_1
,
stp1_6
);
tmp3
=
_mm_sub_epi16
(
stp1_1
,
stp1_6
);
stp2_0
=
_mm_unpackhi_epi64
(
tmp0
,
zero
);
stp2_1
=
_mm_unpacklo_epi64
(
tmp2
,
zero
);
stp2_2
=
_mm_unpackhi_epi64
(
tmp2
,
zero
);
stp2_3
=
_mm_unpacklo_epi64
(
tmp0
,
zero
);
stp2_4
=
_mm_unpacklo_epi64
(
tmp1
,
zero
);
stp2_5
=
_mm_unpackhi_epi64
(
tmp3
,
zero
);
stp2_6
=
_mm_unpacklo_epi64
(
tmp3
,
zero
);
stp2_7
=
_mm_unpackhi_epi64
(
tmp1
,
zero
);
}
// Stage7. Left 8x16 only.
...
...
@@ -2687,7 +2631,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
l
[
14
]
=
_mm_sub_epi16
(
stp2_1
,
stp1_14
);
l
[
15
]
=
_mm_sub_epi16
(
stp2_0
,
stp1_15
);
//
2-D idct. We do 2
8x16 block
s.
//
Second 1-D inverse transform, performed per
8x16 block
for
(
i
=
0
;
i
<
2
;
i
++
)
{
array_transpose_4X8
(
l
+
8
*
i
,
in
);
in
[
8
]
=
in
[
9
]
=
in
[
10
]
=
in
[
11
]
=
in
[
12
]
=
in
[
13
]
=
in
[
14
]
=
in
[
15
]
=
zero
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment