Xiph.Org / aom-rav1e

Commit b9c934df
Authored Jun 25, 2013 by Yaowu Xu; committed by Gerrit Code Review, Jun 25, 2013

Merge "Enable sse2 implmentation of 8x8 ADST/DCT"

Parents: ca976db4, a32a086d

Changes: 2 files
vp9/common/vp9_rtcd_defs.sh

@@ -547,7 +547,7 @@ prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
 specialize vp9_short_fht4x4
 
 prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
-specialize vp9_short_fht8x8
+specialize vp9_short_fht8x8 sse2
 
 prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
 specialize vp9_short_fht16x16
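For context: vp9_rtcd_defs.sh is the input to libvpx's run-time CPU dispatch (RTCD) generator, so adding sse2 to the specialize line is what makes the new kernel reachable from the encoder. Below is a hedged sketch of the dispatch this produces; the generated vp9_rtcd.h differs in detail, and setup_fht8x8_dispatch is a made-up name standing in for the generated setup routine.

#include <stdint.h>
#include "vpx_ports/x86.h"  // x86_simd_caps(), HAS_SSE2

// Hypothetical declarations mirroring what the generated vp9_rtcd.h is
// expected to contain for this entry.
void vp9_short_fht8x8_c(int16_t *input, int16_t *output, int pitch,
                        int tx_type);
void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output, int pitch,
                           int tx_type);

// Function pointer the encoder actually calls.
void (*vp9_short_fht8x8)(int16_t *input, int16_t *output, int pitch,
                         int tx_type);

static void setup_fht8x8_dispatch(void) {
  const int flags = x86_simd_caps();  // query CPU capabilities once at init
  vp9_short_fht8x8 = vp9_short_fht8x8_c;
  if (flags & HAS_SSE2) vp9_short_fht8x8 = vp9_short_fht8x8_sse2;
}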
vp9/encoder/x86/vp9_dct_sse2.c
@@ -10,6 +10,7 @@
 #include <emmintrin.h>  // SSE2
+#include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"
 
 void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
   // The 2D transform is done with two passes which are actually pretty
   ...
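The one-line change in this hunk pulls in the fixed-point cosine constants and helpers that the code added below relies on. A hedged sketch of what they provide follows; names prefixed with sketch_/SKETCH_ are placeholders, not the real definitions in vp9/common/vp9_idct.h.

/* Hedged sketch of the helpers assumed from vp9/common/vp9_idct.h:
 *   - cospi_N_64 is cos(N * pi / 64) in Q14 fixed point
 *     (e.g. cospi_16_64 is approximately 16384 / sqrt(2));
 *   - DCT_CONST_BITS / DCT_CONST_ROUNDING implement the matching
 *     round-and-shift back down to coefficient precision;
 *   - pair_set_epi16(a, b) builds the vector {a, b, a, b, a, b, a, b}. */
#include <emmintrin.h>
#include <stdint.h>

#define SKETCH_DCT_CONST_BITS 14
#define SKETCH_DCT_CONST_ROUNDING (1 << (SKETCH_DCT_CONST_BITS - 1))

static __m128i sketch_pair_set_epi16(int16_t a, int16_t b) {
  /* lanes, low to high: a, b, a, b, a, b, a, b */
  return _mm_set_epi16(b, a, b, a, b, a, b, a);
}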
@@ -373,6 +374,498 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
  }
}
// load 8x8 array
static INLINE void load_buffer_8x8(int16_t *input, __m128i in[8], int stride) {
  in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));
  in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));
  in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));
  in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));
  in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));
  in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));
  in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));
  in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));

  in[0] = _mm_slli_epi16(in[0], 2);
  in[1] = _mm_slli_epi16(in[1], 2);
  in[2] = _mm_slli_epi16(in[2], 2);
  in[3] = _mm_slli_epi16(in[3], 2);
  in[4] = _mm_slli_epi16(in[4], 2);
  in[5] = _mm_slli_epi16(in[5], 2);
  in[6] = _mm_slli_epi16(in[6], 2);
  in[7] = _mm_slli_epi16(in[7], 2);
}
// write 8x8 array
static INLINE void write_buffer_8x8(int16_t *output, __m128i res[8]) {
  res[0] = _mm_srai_epi16(res[0], 1);
  res[1] = _mm_srai_epi16(res[1], 1);
  res[2] = _mm_srai_epi16(res[2], 1);
  res[3] = _mm_srai_epi16(res[3], 1);
  res[4] = _mm_srai_epi16(res[4], 1);
  res[5] = _mm_srai_epi16(res[5], 1);
  res[6] = _mm_srai_epi16(res[6], 1);
  res[7] = _mm_srai_epi16(res[7], 1);

  _mm_store_si128((__m128i *)(output + 0 * 8), res[0]);
  _mm_store_si128((__m128i *)(output + 1 * 8), res[1]);
  _mm_store_si128((__m128i *)(output + 2 * 8), res[2]);
  _mm_store_si128((__m128i *)(output + 3 * 8), res[3]);
  _mm_store_si128((__m128i *)(output + 4 * 8), res[4]);
  _mm_store_si128((__m128i *)(output + 5 * 8), res[5]);
  _mm_store_si128((__m128i *)(output + 6 * 8), res[6]);
  _mm_store_si128((__m128i *)(output + 7 * 8), res[7]);
}
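// Editor's note (not part of this commit): load_buffer_8x8 pre-scales every
// input sample by 4 (_mm_slli_epi16(x, 2)) and write_buffer_8x8 halves the
// finished coefficients with an arithmetic right shift by 1.  The extra two
// bits of precision are carried through the intermediate stages while the
// values still fit in 16-bit lanes; the assumption is that this mirrors the
// scaling of the C reference forward transform so both paths produce
// matching output.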
// perform in-place transpose
static INLINE void array_transpose_8x8(__m128i res[8]) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(res[0], res[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(res[2], res[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(res[4], res[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(res[6], res[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(res[4], res[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(res[6], res[7]);
  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  // 04 14 05 15 06 16 07 17
  // 24 34 25 35 26 36 27 37
  // 40 50 41 51 42 52 43 53
  // 60 70 61 71 62 72 63 73
  // 44 54 45 55 46 56 47 57
  // 64 74 65 75 66 76 67 77
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  // 00 10 20 30 01 11 21 31
  // 40 50 60 70 41 51 61 71
  // 02 12 22 32 03 13 23 33
  // 42 52 62 72 43 53 63 73
  // 04 14 24 34 05 15 25 35
  // 44 54 64 74 45 55 65 75
  // 06 16 26 36 07 17 27 37
  // 46 56 66 76 47 57 67 77
  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
  // 00 10 20 30 40 50 60 70
  // 01 11 21 31 41 51 61 71
  // 02 12 22 32 42 52 62 72
  // 03 13 23 33 43 53 63 73
  // 04 14 24 34 44 54 64 74
  // 05 15 25 35 45 55 65 75
  // 06 16 26 36 46 56 66 76
  // 07 17 27 37 47 57 67 77
}
void fdct8_1d_sse2(__m128i in[8]) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);
  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);
  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  array_transpose_8x8(in);
}
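// Editor's note (not part of this commit): the recurring idiom above is a
// fixed-point butterfly.  With data interleaved as {x0, y0, x1, y1, ...} and
// a constant vector built by pair_set_epi16(c1, c2) (assumed to hold
// {c1, c2, c1, c2, ...}), _mm_madd_epi16 produces c1*x + c2*y in each 32-bit
// lane; adding DCT_CONST_ROUNDING and shifting right by DCT_CONST_BITS then
// rounds back to coefficient precision.  Per coefficient this is equivalent
// to:
//
//   out = (c1 * x + c2 * y + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
//
// (DCT_CONST_ROUNDING is assumed to be 1 << (DCT_CONST_BITS - 1), as defined
// in vp9/common/vp9_idct.h.)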
void fadst8_1d_sse2(__m128i in[8]) {
  // Constants
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);
  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // FIXME(jingning): do subtract using bit inversion?
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose
  array_transpose_8x8(in);
}
void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
                           int stride, int tx_type) {
  __m128i in[8];
  load_buffer_8x8(input, in, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct8_1d_sse2(in);
      fdct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      fadst8_1d_sse2(in);
      fdct8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      fdct8_1d_sse2(in);
      fadst8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      fadst8_1d_sse2(in);
      fadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }
  write_buffer_8x8(output, in);
}
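// Editor's note (not part of this commit): because fdct8_1d_sse2 and
// fadst8_1d_sse2 each finish with array_transpose_8x8, the two consecutive
// 1-D calls in the switch above implement the separable 2-D transform: the
// first pass works down the columns held across the eight __m128i rows, the
// transpose swaps rows and columns, and the second pass then covers the
// other dimension before write_buffer_8x8 stores the result.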
void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  ...
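A hedged verification sketch: the usual way to sanity-check a new SIMD transform like this one is to compare it against the C reference on random input. The harness below assumes the generic variant is named vp9_short_fht8x8_c with the same signature as the SSE2 version and that both interpret the third argument as an element stride (libvpx RTCD convention); it is illustrative only and not part of the commit.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "vpx_ports/mem.h"  // DECLARE_ALIGNED

// Assumed prototypes; link against the libvpx objects that provide them.
void vp9_short_fht8x8_c(int16_t *input, int16_t *output, int stride,
                        int tx_type);
void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output, int stride,
                           int tx_type);

int main(void) {
  // 16-byte alignment matters: the SSE2 path uses _mm_load/_mm_store.
  DECLARE_ALIGNED(16, int16_t, input[8 * 8]);
  DECLARE_ALIGNED(16, int16_t, out_c[8 * 8]);
  DECLARE_ALIGNED(16, int16_t, out_sse2[8 * 8]);
  int tx_type, i;

  srand(0);
  for (tx_type = 0; tx_type < 4; ++tx_type) {
    for (i = 0; i < 8 * 8; ++i)
      input[i] = (int16_t)((rand() % 512) - 256);  // pixel-difference range
    vp9_short_fht8x8_c(input, out_c, 8, tx_type);
    vp9_short_fht8x8_sse2(input, out_sse2, 8, tx_type);
    assert(memcmp(out_c, out_sse2, sizeof(out_c)) == 0);
  }
  return 0;
}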