Xiph.Org / aom-rav1e / Commits / 33231d48

Commit 33231d48 authored Oct 06, 2016 by David Barker
Add sse2 forward and inverse 16x32 and 32x16 transforms
Change-Id: I1241257430f1e08ead1ce0f31db8272b50783102
parent cad8283e

Changes 11
aom_dsp/aom_dsp.mk
...
...
@@ -186,6 +186,7 @@ DSP_SRCS-yes += fwd_txfm.c
DSP_SRCS-yes += fwd_txfm.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32_8cols_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
ifeq ($(ARCH_X86_64),yes)
...
...
aom_dsp/x86/fwd_dct32_8cols_sse2.c
0 → 100644
This diff is collapsed.
aom_dsp/x86/fwd_txfm_sse2.h
...
...
@@ -365,6 +365,8 @@ static INLINE void transpose_and_output8x8(
  }
}

+void fdct32_8col(__m128i *in0, __m128i *in1);

#ifdef __cplusplus
}  // extern "C"
#endif
...
...
aom_dsp/x86/inv_txfm_sse2.c
...
...
@@ -2669,28 +2669,28 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
stp1_31 = stp2_31; \
}
-#define IDCT32 \
+#define IDCT32(in0, in1) \
  /* Stage1 */ \
  { \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
-    const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
-    const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
\
-    const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
-    const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
\
-    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
-    const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
-    const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
\
-    const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
-    const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
+    const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]); \
+    const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]); \
+    const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]); \
+    const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]); \
\
+    const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]); \
+    const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]); \
+    const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]); \
+    const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]); \
\
+    const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]); \
+    const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]); \
+    const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]); \
+    const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]); \
\
+    const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]); \
+    const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]); \
+    const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]); \
+    const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]); \
\
MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
...
...
@@ -2707,15 +2707,15 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
  /* Stage2 */ \
  { \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
-    const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
-    const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
+    const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]); \
+    const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]); \
+    const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]); \
+    const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]); \
\
-    const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
-    const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
+    const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]); \
+    const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]); \
+    const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]); \
+    const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]); \
\
MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
...
...
@@ -2747,10 +2747,10 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
  /* Stage3 */ \
  { \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
-    const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
-    const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
+    const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]); \
+    const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]); \
+    const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]); \
+    const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]); \
\
const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
...
...
@@ -2794,10 +2794,10 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
  /* Stage4 */ \
  { \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
-    const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
-    const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
+    const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]); \
+    const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]); \
+    const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]); \
+    const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]); \
\
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
...
...
@@ -3338,7 +3338,7 @@ void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
    array_transpose_8x8(in + 16, in + 16);
    array_transpose_8x8(in + 24, in + 24);
-    IDCT32
+    IDCT32(in, in + 16)
    // 1_D: Store 32 intermediate results for each 8x32 block.
    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
...
...
@@ -3384,7 +3384,7 @@ void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
    array_transpose_8x8(col + j + 64, in + 16);
    array_transpose_8x8(col + j + 96, in + 24);
-    IDCT32
+    IDCT32(in, in + 16)
    // 2_D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
...
...
@@ -3451,6 +3451,107 @@ void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
}
}
+// Apply a 32-element IDCT to 8 columns. This does not do any transposition
+// of its input - the caller is expected to have done that.
+// The input buffers are the top and bottom halves of an 8x32 block.
+void idct32_8col(__m128i *in0, __m128i *in1) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  // idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
+      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  IDCT32(in0, in1)
+
+  // 2_D: Calculate the results and store them to destination.
+  in0[0] = _mm_add_epi16(stp1_0, stp1_31);
+  in0[1] = _mm_add_epi16(stp1_1, stp1_30);
+  in0[2] = _mm_add_epi16(stp1_2, stp1_29);
+  in0[3] = _mm_add_epi16(stp1_3, stp1_28);
+  in0[4] = _mm_add_epi16(stp1_4, stp1_27);
+  in0[5] = _mm_add_epi16(stp1_5, stp1_26);
+  in0[6] = _mm_add_epi16(stp1_6, stp1_25);
+  in0[7] = _mm_add_epi16(stp1_7, stp1_24);
+  in0[8] = _mm_add_epi16(stp1_8, stp1_23);
+  in0[9] = _mm_add_epi16(stp1_9, stp1_22);
+  in0[10] = _mm_add_epi16(stp1_10, stp1_21);
+  in0[11] = _mm_add_epi16(stp1_11, stp1_20);
+  in0[12] = _mm_add_epi16(stp1_12, stp1_19);
+  in0[13] = _mm_add_epi16(stp1_13, stp1_18);
+  in0[14] = _mm_add_epi16(stp1_14, stp1_17);
+  in0[15] = _mm_add_epi16(stp1_15, stp1_16);
+  in1[0] = _mm_sub_epi16(stp1_15, stp1_16);
+  in1[1] = _mm_sub_epi16(stp1_14, stp1_17);
+  in1[2] = _mm_sub_epi16(stp1_13, stp1_18);
+  in1[3] = _mm_sub_epi16(stp1_12, stp1_19);
+  in1[4] = _mm_sub_epi16(stp1_11, stp1_20);
+  in1[5] = _mm_sub_epi16(stp1_10, stp1_21);
+  in1[6] = _mm_sub_epi16(stp1_9, stp1_22);
+  in1[7] = _mm_sub_epi16(stp1_8, stp1_23);
+  in1[8] = _mm_sub_epi16(stp1_7, stp1_24);
+  in1[9] = _mm_sub_epi16(stp1_6, stp1_25);
+  in1[10] = _mm_sub_epi16(stp1_5, stp1_26);
+  in1[11] = _mm_sub_epi16(stp1_4, stp1_27);
+  in1[12] = _mm_sub_epi16(stp1_3, stp1_28);
+  in1[13] = _mm_sub_epi16(stp1_2, stp1_29);
+  in1[14] = _mm_sub_epi16(stp1_1, stp1_30);
+  in1[15] = _mm_sub_epi16(stp1_0, stp1_31);
+}
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
  __m128i ubounded, retval;
...
...
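
Aside on the SSE2 idiom used above (illustration, not part of the commit): the stage constants are built with pair_set_epi16(c0, c1), and the IDCT32 macro interleaves pairs of input rows with _mm_unpacklo/hi_epi16 before handing them to MULTIPLICATION_AND_ADD, whose body is not shown in this diff. The standalone sketch below, with made-up inputs and constants, shows the underlying pattern that arrangement relies on: _mm_madd_epi16 on the interleaved data computes a*c0 + b*c1 in each 32-bit lane, i.e. one half of a butterfly. In the real code the products are also combined with the rounding constant declared at the top of idct32_8col and shifted down by DCT_CONST_BITS, which this sketch omits. pair_set_epi16_local is a local stand-in for the library helper, not the library function itself.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Local stand-in for pair_set_epi16(): replicate the 32-bit value
 * (b << 16) | a across the register, giving alternating a,b int16 lanes. */
static __m128i pair_set_epi16_local(int16_t a, int16_t b) {
  const uint32_t pair = (uint16_t)a | ((uint32_t)(uint16_t)b << 16);
  return _mm_set1_epi32((int32_t)pair);
}

int main(void) {
  /* Two vectors of eight 16-bit inputs, standing in for two coefficient
   * rows such as in[1] and in[31] in the IDCT32 macro. */
  const __m128i x = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  const __m128i y = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
  /* Alternating constants, like pair_set_epi16(cospi_a_64, -cospi_b_64). */
  const __m128i c = pair_set_epi16_local(3, -2);

  /* Interleave: lo = x0,y0,x1,y1,...  hi = x4,y4,x5,y5,... */
  const __m128i lo = _mm_unpacklo_epi16(x, y);
  const __m128i hi = _mm_unpackhi_epi16(x, y);

  /* madd multiplies adjacent 16-bit pairs and sums them into 32-bit lanes,
   * so each output lane holds x[i]*3 + y[i]*(-2). */
  const __m128i r_lo = _mm_madd_epi16(lo, c);
  const __m128i r_hi = _mm_madd_epi16(hi, c);

  int32_t out[8];
  int i;
  _mm_storeu_si128((__m128i *)out, r_lo);
  _mm_storeu_si128((__m128i *)(out + 4), r_hi);
  for (i = 0; i < 8; ++i) printf("%d ", out[i]);
  printf("\n"); /* prints: -17 -34 -51 -68 -85 -102 -119 -136 */
  return 0;
}
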
aom_dsp/x86/inv_txfm_sse2.h
...
...
@@ -203,5 +203,6 @@ void idct16_sse2(__m128i *in0, __m128i *in1);
void iadst4_sse2(__m128i *in);
void iadst8_sse2(__m128i *in);
void iadst16_sse2(__m128i *in0, __m128i *in1);
+void idct32_8col(__m128i *in0, __m128i *in1);

#endif  // AOM_DSP_X86_INV_TXFM_SSE2_H_
av1/common/av1_rtcd_defs.pl
...
...
@@ -103,10 +103,10 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
  specialize qw/av1_iht16x8_128_add sse2/;

  add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-  specialize qw/av1_iht16x32_512_add/;
+  specialize qw/av1_iht16x32_512_add sse2/;

  add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-  specialize qw/av1_iht32x16_512_add/;
+  specialize qw/av1_iht32x16_512_add sse2/;
}

add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
...
...
@@ -164,10 +164,10 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
  specialize qw/av1_iht16x8_128_add sse2/;

  add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-  specialize qw/av1_iht16x32_512_add/;
+  specialize qw/av1_iht16x32_512_add sse2/;

  add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-  specialize qw/av1_iht32x16_512_add/;
+  specialize qw/av1_iht32x16_512_add sse2/;
}

add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
...
...
@@ -404,10 +404,10 @@ if (aom_config("CONFIG_EXT_TX") eq "yes") {
  specialize qw/av1_fht16x8 sse2/;

  add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht16x32/;
+  specialize qw/av1_fht16x32 sse2/;

  add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht32x16/;
+  specialize qw/av1_fht32x16 sse2/;

  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/av1_fht32x32 avx2/;
...
...
av1/common/x86/idct_intrin_sse2.c
...
...
@@ -496,6 +496,12 @@ static void iidtx16_8col(__m128i *in) {
  in[15] = _mm_packs_epi32(u7, y7);
}

+static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
+  array_transpose_16x16(in0, in1);
+  iidtx16_8col(in0);
+  iidtx16_8col(in1);
+}
+
static void iidtx8_sse2(__m128i *in) {
  in[0] = _mm_slli_epi16(in[0], 1);
  in[1] = _mm_slli_epi16(in[1], 1);
...
...
@@ -628,6 +634,11 @@ static INLINE void scale_sqrt2_8x8(__m128i *in) {
                         xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
}

+static INLINE void scale_sqrt2_8x16(__m128i *in) {
+  scale_sqrt2_8x8(in);
+  scale_sqrt2_8x8(in + 8);
+}
+
void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride, int tx_type) {
  __m128i in[16];
...
...
@@ -1202,4 +1213,322 @@ void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
  in[3] = _mm_unpacklo_epi64(in[6], in[7]);
  write_buffer_4x8_round5(dest, in, stride);
}

+// Note: The 16-column 32-element transforms take input in the form of four
+// 8x16 blocks (each stored as a __m128i[16]), which are the four quadrants
+// of the overall 16x32 input buffer.
+static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+                                __m128i *br) {
+  array_transpose_16x16(tl, tr);
+  array_transpose_16x16(bl, br);
+  idct32_8col(tl, bl);
+  idct32_8col(tr, br);
+}
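
A tiny standalone sketch (plain C, illustrative only, not part of the commit) of the quadrant indexing described in the note above; the pointer offsets mirror the loads in av1_iht16x32_512_add_sse2 further down (input + i*16 + 0/8 and input + (i+16)*16 + 0/8).

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* A 16-wide, 32-tall coefficient block stored row-major, 16 coefficients
   * per row, filled with its own linear index so the quadrant offsets are
   * easy to see. */
  int16_t input[32 * 16];
  int r, c;
  for (r = 0; r < 32; ++r)
    for (c = 0; c < 16; ++c) input[r * 16 + c] = (int16_t)(r * 16 + c);

  {
    /* The four 8x16 quadrants. */
    const int16_t *tl = &input[0 * 16 + 0];  /* rows 0-15,  columns 0-7  */
    const int16_t *tr = &input[0 * 16 + 8];  /* rows 0-15,  columns 8-15 */
    const int16_t *bl = &input[16 * 16 + 0]; /* rows 16-31, columns 0-7  */
    const int16_t *br = &input[16 * 16 + 8]; /* rows 16-31, columns 8-15 */
    printf("%d %d %d %d\n", tl[0], tr[0], bl[0], br[0]); /* 0 8 256 264 */
  }
  return 0;
}
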
+static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+                                      __m128i *br) {
+  __m128i tmpl[16], tmpr[16];
+  int i;
+
+  // Copy the top half of the input to temporary storage
+  for (i = 0; i < 16; ++i) {
+    tmpl[i] = tl[i];
+    tmpr[i] = tr[i];
+  }
+
+  // Generate the top half of the output
+  for (i = 0; i < 16; ++i) {
+    tl[i] = _mm_slli_epi16(bl[i], 2);
+    tr[i] = _mm_slli_epi16(br[i], 2);
+  }
+  array_transpose_16x16(tl, tr);
+
+  // Copy the temporary storage back to the bottom half of the input
+  for (i = 0; i < 16; ++i) {
+    bl[i] = tmpl[i];
+    br[i] = tmpr[i];
+  }
+
+  // Generate the bottom half of the output
+  scale_sqrt2_8x16(bl);
+  scale_sqrt2_8x16(br);
+  idct16_sse2(bl, br);  // Includes a transposition
+}
+static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+                                 __m128i *br) {
+  int i;
+  array_transpose_16x16(tl, tr);
+  array_transpose_16x16(bl, br);
+  for (i = 0; i < 16; ++i) {
+    tl[i] = _mm_slli_epi16(tl[i], 2);
+    tr[i] = _mm_slli_epi16(tr[i], 2);
+    bl[i] = _mm_slli_epi16(bl[i], 2);
+    br[i] = _mm_slli_epi16(br[i], 2);
+  }
+}
+static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
+                                             __m128i *intr, __m128i *inbl,
+                                             __m128i *inbr, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    intl[i] = _mm_adds_epi16(intl[i], final_rounding);
+    intr[i] = _mm_adds_epi16(intr[i], final_rounding);
+    inbl[i] = _mm_adds_epi16(inbl[i], final_rounding);
+    inbr[i] = _mm_adds_epi16(inbr[i], final_rounding);
+    intl[i] = _mm_srai_epi16(intl[i], 6);
+    intr[i] = _mm_srai_epi16(intr[i], 6);
+    inbl[i] = _mm_srai_epi16(inbl[i], 6);
+    inbr[i] = _mm_srai_epi16(inbr[i], 6);
+    RECON_AND_STORE(dest + i * stride + 0, intl[i]);
+    RECON_AND_STORE(dest + i * stride + 8, intr[i]);
+    RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]);
+    RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]);
+  }
+}
+void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
+                               int stride, int tx_type) {
+  __m128i intl[16], intr[16], inbl[16], inbr[16];
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    intl[i] = load_input_data(input + i * 16 + 0);
+    intr[i] = load_input_data(input + i * 16 + 8);
+    inbl[i] = load_input_data(input + (i + 16) * 16 + 0);
+    inbr[i] = load_input_data(input + (i + 16) * 16 + 8);
+  }
+
+  // Row transform
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case FLIPADST_DCT:
+    case H_DCT:
+      idct16_sse2(intl, intr);
+      idct16_sse2(inbl, inbr);
+      break;
+    case DCT_ADST:
+    case ADST_ADST:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case H_ADST:
+    case H_FLIPADST:
+      iadst16_sse2(intl, intr);
+      iadst16_sse2(inbl, inbr);
+      break;
+    case V_FLIPADST:
+    case V_ADST:
+    case V_DCT:
+    case IDTX:
+      iidtx16_sse2(intl, intr);
+      iidtx16_sse2(inbl, inbr);
+      break;
+    default: assert(0); break;
+  }
+
+  scale_sqrt2_8x16(intl);
+  scale_sqrt2_8x16(intr);
+  scale_sqrt2_8x16(inbl);
+  scale_sqrt2_8x16(inbr);
+
+  // Column transform
+  switch (tx_type) {
+    case DCT_DCT:
+    case DCT_ADST:
+    case DCT_FLIPADST:
+    case V_DCT:
+      idct32_16col(intl, intr, inbl, inbr);
+      break;
+    case ADST_DCT:
+    case ADST_ADST:
+    case FLIPADST_ADST:
+    case ADST_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case FLIPADST_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      ihalfright32_16col(intl, intr, inbl, inbr);
+      break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+    case IDTX:
+      iidtx32_16col(intl, intr, inbl, inbr);
+      break;
+    default: assert(0); break;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case H_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case H_ADST:
+    case V_ADST:
+    case V_DCT:
+    case IDTX: break;
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST:
+      for (i = 0; i < 16; ++i) {
+        __m128i tmp = intl[i];
+        intl[i] = mm_reverse_epi16(intr[i]);
+        intr[i] = mm_reverse_epi16(tmp);
+        tmp = inbl[i];
+        inbl[i] = mm_reverse_epi16(inbr[i]);
+        inbr[i] = mm_reverse_epi16(tmp);
+      }
+      break;
+    case FLIPADST_FLIPADST:
+      for (i = 0; i < 16; ++i) {
+        __m128i tmp = intl[i];
+        intl[i] = mm_reverse_epi16(intr[i]);
+        intr[i] = mm_reverse_epi16(tmp);
+        tmp = inbl[i];
+        inbl[i] = mm_reverse_epi16(inbr[i]);
+        inbr[i] = mm_reverse_epi16(tmp);
+      }
+      FLIPUD_PTR(dest, stride, 32);
+      break;
+    default: assert(0); break;
+  }
+  write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
+}
+static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
+                                             __m128i *in1, __m128i *in2,
+                                             __m128i *in3, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    in0[i] = _mm_adds_epi16(in0[i], final_rounding);
+    in1[i] = _mm_adds_epi16(in1[i], final_rounding);
+    in2[i] = _mm_adds_epi16(in2[i], final_rounding);
+    in3[i] = _mm_adds_epi16(in3[i], final_rounding);
+    in0[i] = _mm_srai_epi16(in0[i], 6);
+    in1[i] = _mm_srai_epi16(in1[i], 6);
+    in2[i] = _mm_srai_epi16(in2[i], 6);
+    in3[i] = _mm_srai_epi16(in3[i], 6);
+    RECON_AND_STORE(dest + i * stride + 0, in0[i]);
+    RECON_AND_STORE(dest + i * stride + 8, in1[i]);
+    RECON_AND_STORE(dest + i * stride + 16, in2[i]);
+    RECON_AND_STORE(dest + i * stride + 24, in3[i]);
+  }
+}
+void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
+                               int stride, int tx_type) {
+  __m128i in0[16], in1[16], in2[16], in3[16];
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    in0[i] = load_input_data(input + i * 32 + 0);
+    in1[i] = load_input_data(input + i * 32 + 8);
+    in2[i] = load_input_data(input + i * 32 + 16);
+    in3[i] = load_input_data(input + i * 32 + 24);
+  }
+
+  // Row transform
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case FLIPADST_DCT:
+    case H_DCT:
+      idct32_16col(in0, in1, in2, in3);
+      break;
+    case DCT_ADST:
+    case ADST_ADST:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case H_ADST:
+    case H_FLIPADST:
+      ihalfright32_16col(in0, in1, in2, in3);
+      break;
+    case V_FLIPADST:
+    case V_ADST:
+    case V_DCT:
+    case IDTX:
+      iidtx32_16col(in0, in1, in2, in3);
+      break;
+    default: assert(0); break;
+  }
+
+  scale_sqrt2_8x16(in0);
+  scale_sqrt2_8x16(in1);
+  scale_sqrt2_8x16(in2);
+  scale_sqrt2_8x16(in3);
+
+  // Column transform
+  switch (tx_type) {
+    case DCT_DCT:
+    case DCT_ADST:
+    case DCT_FLIPADST:
+    case V_DCT:
+      idct16_sse2(in0, in1);
+      idct16_sse2(in2, in3);
+      break;
+    case ADST_DCT:
+    case ADST_ADST:
+    case FLIPADST_ADST:
+    case ADST_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case FLIPADST_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      iadst16_sse2(in0, in1);