Xiph.Org / aom-rav1e / Commits

Commit a720f4b3
Authored Oct 14, 2016 by Debargha Mukherjee
Committed by Gerrit Code Review, Oct 14, 2016

Merge "Add sse2 forward and inverse 16x32 and 32x16 transforms" into nextgenv2

Parents: a48764d0, 33231d48
Changes: 11 files
aom_dsp/aom_dsp.mk
@@ -186,6 +186,7 @@ DSP_SRCS-yes += fwd_txfm.c
 DSP_SRCS-yes          += fwd_txfm.h
 DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h
 DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32_8cols_sse2.c
 DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h
 DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
 ifeq ($(ARCH_X86_64),yes)
aom_dsp/x86/fwd_dct32_8cols_sse2.c  (new file, mode 100644)
(diff collapsed)
aom_dsp/x86/fwd_txfm_sse2.h
@@ -365,6 +365,8 @@ static INLINE void transpose_and_output8x8(
   }
 }

+void fdct32_8col(__m128i *in0, __m128i *in1);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
aom_dsp/x86/inv_txfm_sse2.c
@@ -2669,28 +2669,28 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
     stp1_31 = stp2_31; \
   }

-#define IDCT32 \
+#define IDCT32(in0, in1) \
   /* Stage1 */ \
   { \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
-    const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
-    const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
-    const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
-    const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
-    const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
-    const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
-    const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
-    const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
+    const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]); \
+    const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]); \
+    const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]); \
+    const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]); \
+    const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]); \
+    const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]); \
+    const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]); \
+    const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]); \
+    const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]); \
+    const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]); \
+    const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]); \
+    const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]); \
+    const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]); \
+    const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]); \
+    const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]); \
+    const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]); \
     MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
                            stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
@@ -2707,15 +2707,15 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
   /* Stage2 */ \
   { \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
-    const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
-    const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
+    const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]); \
+    const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]); \
+    const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]); \
+    const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]); \
-    const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
-    const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
+    const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]); \
+    const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]); \
+    const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]); \
+    const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]); \
     MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
                            stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
@@ -2747,10 +2747,10 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
   /* Stage3 */ \
   { \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
-    const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
-    const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
+    const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]); \
+    const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]); \
+    const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]); \
+    const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]); \
     const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
     const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
@@ -2794,10 +2794,10 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
   /* Stage4 */ \
   { \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
-    const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
-    const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
+    const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]); \
+    const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]); \
+    const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]); \
+    const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]); \
     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
@@ -3338,7 +3338,7 @@ void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
     array_transpose_8x8(in + 16, in + 16);
     array_transpose_8x8(in + 24, in + 24);

-    IDCT32
+    IDCT32(in, in + 16)

     // 1_D: Store 32 intermediate results for each 8x32 block.
     col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3384,7 +3384,7 @@ void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
       array_transpose_8x8(col + j + 64, in + 16);
       array_transpose_8x8(col + j + 96, in + 24);

-      IDCT32
+      IDCT32(in, in + 16)

       // 2_D: Calculate the results and store them to destination.
       in[0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3451,6 +3451,107 @@ void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
   }
 }

+// Apply a 32-element IDCT to 8 columns. This does not do any transposition
+// of its input - the caller is expected to have done that.
+// The input buffers are the top and bottom halves of an 8x32 block.
+void idct32_8col(__m128i *in0, __m128i *in1) {
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  // idct constants for each stage
+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
+      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  IDCT32(in0, in1)
+
+  // 2_D: Calculate the results and store them to destination.
+  in0[0] = _mm_add_epi16(stp1_0, stp1_31);
+  in0[1] = _mm_add_epi16(stp1_1, stp1_30);
+  in0[2] = _mm_add_epi16(stp1_2, stp1_29);
+  in0[3] = _mm_add_epi16(stp1_3, stp1_28);
+  in0[4] = _mm_add_epi16(stp1_4, stp1_27);
+  in0[5] = _mm_add_epi16(stp1_5, stp1_26);
+  in0[6] = _mm_add_epi16(stp1_6, stp1_25);
+  in0[7] = _mm_add_epi16(stp1_7, stp1_24);
+  in0[8] = _mm_add_epi16(stp1_8, stp1_23);
+  in0[9] = _mm_add_epi16(stp1_9, stp1_22);
+  in0[10] = _mm_add_epi16(stp1_10, stp1_21);
+  in0[11] = _mm_add_epi16(stp1_11, stp1_20);
+  in0[12] = _mm_add_epi16(stp1_12, stp1_19);
+  in0[13] = _mm_add_epi16(stp1_13, stp1_18);
+  in0[14] = _mm_add_epi16(stp1_14, stp1_17);
+  in0[15] = _mm_add_epi16(stp1_15, stp1_16);
+  in1[0] = _mm_sub_epi16(stp1_15, stp1_16);
+  in1[1] = _mm_sub_epi16(stp1_14, stp1_17);
+  in1[2] = _mm_sub_epi16(stp1_13, stp1_18);
+  in1[3] = _mm_sub_epi16(stp1_12, stp1_19);
+  in1[4] = _mm_sub_epi16(stp1_11, stp1_20);
+  in1[5] = _mm_sub_epi16(stp1_10, stp1_21);
+  in1[6] = _mm_sub_epi16(stp1_9, stp1_22);
+  in1[7] = _mm_sub_epi16(stp1_8, stp1_23);
+  in1[8] = _mm_sub_epi16(stp1_7, stp1_24);
+  in1[9] = _mm_sub_epi16(stp1_6, stp1_25);
+  in1[10] = _mm_sub_epi16(stp1_5, stp1_26);
+  in1[11] = _mm_sub_epi16(stp1_4, stp1_27);
+  in1[12] = _mm_sub_epi16(stp1_3, stp1_28);
+  in1[13] = _mm_sub_epi16(stp1_2, stp1_29);
+  in1[14] = _mm_sub_epi16(stp1_1, stp1_30);
+  in1[15] = _mm_sub_epi16(stp1_0, stp1_31);
+}
+
 #if CONFIG_AOM_HIGHBITDEPTH
 static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
   __m128i ubounded, retval;
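For reference, the pair_set_epi16 constants and MULTIPLICATION_AND_ADD steps above implement fixed-point rotation butterflies: each coefficient pair (x, y) is interleaved with _mm_unpacklo/hi_epi16 and multiplied against a packed (c0, c1) constant pair with _mm_madd_epi16, giving x*c0 + y*c1 per 32-bit lane, which is then rounded and shifted back down by DCT_CONST_BITS. A minimal scalar sketch of one such butterfly follows; the cospi values shown are the usual 14-bit cosine constants (round(cos(k*pi/64) * 2^14)) and are stated here as an assumption, not taken from this diff.

#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* Scalar equivalent of one lane of MULTIPLICATION_AND_ADD: compute
 * x * c0 + y * c1 in 32 bits, round, and shift back to 16 bits. */
static int16_t butterfly(int32_t x, int32_t y, int32_t c0, int32_t c1) {
  return (int16_t)((x * c0 + y * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}

int main(void) {
  /* Assumed 14-bit fixed-point cosines for cospi_31_64 and cospi_1_64. */
  const int32_t cospi_31_64 = 804, cospi_1_64 = 16364;
  const int32_t x = 100, y = -50; /* one interleaved coefficient pair */
  /* stg1_0 and stg1_1 in idct32_8col correspond to this output pair. */
  printf("%d %d\n", butterfly(x, y, cospi_31_64, -cospi_1_64),
         butterfly(x, y, cospi_1_64, cospi_31_64));
  return 0;
}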
aom_dsp/x86/inv_txfm_sse2.h
@@ -203,5 +203,6 @@ void idct16_sse2(__m128i *in0, __m128i *in1);
 void iadst4_sse2(__m128i *in);
 void iadst8_sse2(__m128i *in);
 void iadst16_sse2(__m128i *in0, __m128i *in1);
+void idct32_8col(__m128i *in0, __m128i *in1);

 #endif  // AOM_DSP_X86_INV_TXFM_SSE2_H_
av1/common/av1_rtcd_defs.pl
@@ -104,10 +104,10 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
     specialize qw/av1_iht16x8_128_add sse2/;

     add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/av1_iht16x32_512_add/;
+    specialize qw/av1_iht16x32_512_add sse2/;

     add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/av1_iht32x16_512_add/;
+    specialize qw/av1_iht32x16_512_add sse2/;
   }

   add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";

@@ -165,10 +165,10 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
     specialize qw/av1_iht16x8_128_add sse2/;

     add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/av1_iht16x32_512_add/;
+    specialize qw/av1_iht16x32_512_add sse2/;

     add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/av1_iht32x16_512_add/;
+    specialize qw/av1_iht32x16_512_add sse2/;
   }

   add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";

@@ -405,10 +405,10 @@ if (aom_config("CONFIG_EXT_TX") eq "yes") {
   specialize qw/av1_fht16x8 sse2/;

   add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht16x32/;
+  specialize qw/av1_fht16x32 sse2/;

   add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht32x16/;
+  specialize qw/av1_fht32x16 sse2/;

   add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/av1_fht32x32 avx2/;
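Adding sse2 to these specialize lines tells the RTCD (run-time CPU detection) generator to route the corresponding function pointers at the new SSE2 implementations when the CPU supports SSE2. The following is only a rough sketch of what the generated dispatch looks like, with the structure paraphrased rather than copied from the generated header, and with tran_low_t shown as int32_t as an assumption for the high-bit-depth configuration.

#include <stdint.h>

typedef int32_t tran_low_t; /* assumption: high-bit-depth configuration */

/* C reference and SSE2 implementations (defined elsewhere in the tree). */
void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest,
                            int dest_stride, int tx_type);
void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int dest_stride, int tx_type);

/* Dispatch pointer; callers simply call av1_iht16x32_512_add(...). */
void (*av1_iht16x32_512_add)(const tran_low_t *input, uint8_t *dest,
                             int dest_stride, int tx_type);

void setup_rtcd_sketch(int has_sse2) {
  av1_iht16x32_512_add = av1_iht16x32_512_add_c;
  if (has_sse2) av1_iht16x32_512_add = av1_iht16x32_512_add_sse2;
}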
av1/common/x86/idct_intrin_sse2.c
@@ -496,6 +496,12 @@ static void iidtx16_8col(__m128i *in) {
   in[15] = _mm_packs_epi32(u7, y7);
 }

+static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
+  array_transpose_16x16(in0, in1);
+  iidtx16_8col(in0);
+  iidtx16_8col(in1);
+}
+
 static void iidtx8_sse2(__m128i *in) {
   in[0] = _mm_slli_epi16(in[0], 1);
   in[1] = _mm_slli_epi16(in[1], 1);

@@ -628,6 +634,11 @@ static INLINE void scale_sqrt2_8x8(__m128i *in) {
                                  xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
 }

+static INLINE void scale_sqrt2_8x16(__m128i *in) {
+  scale_sqrt2_8x8(in);
+  scale_sqrt2_8x8(in + 8);
+}
+
 void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride, int tx_type) {
   __m128i in[16];
@@ -1202,4 +1213,322 @@ void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
   in[3] = _mm_unpacklo_epi64(in[6], in[7]);
   write_buffer_4x8_round5(dest, in, stride);
 }

+// Note: The 16-column 32-element transforms take input in the form of four
+// 8x16 blocks (each stored as a __m128i[16]), which are the four quadrants
+// of the overall 16x32 input buffer.
+static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+                                __m128i *br) {
+  array_transpose_16x16(tl, tr);
+  array_transpose_16x16(bl, br);
+  idct32_8col(tl, bl);
+  idct32_8col(tr, br);
+}
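To make the quadrant layout described in that note concrete, here is a small scalar sketch of how a row-major 16x32 coefficient block maps onto the four __m128i[16] quadrant buffers (each __m128i row holds eight 16-bit coefficients); the indices mirror the load loop in av1_iht16x32_512_add_sse2 further down. The helper name is hypothetical.

#include <stdint.h>
#include <string.h>

/* Scalar picture of the quadrant split: tl/tr cover rows 0..15 and bl/br
 * rows 16..31; the left quadrants hold columns 0..7, the right ones 8..15.
 * Each int16_t[8] row stands in for one __m128i register. */
void split_16x32_into_quadrants(const int16_t *input /* 32 rows x 16 cols */,
                                int16_t tl[16][8], int16_t tr[16][8],
                                int16_t bl[16][8], int16_t br[16][8]) {
  for (int i = 0; i < 16; ++i) {
    memcpy(tl[i], input + i * 16 + 0, 8 * sizeof(int16_t));
    memcpy(tr[i], input + i * 16 + 8, 8 * sizeof(int16_t));
    memcpy(bl[i], input + (i + 16) * 16 + 0, 8 * sizeof(int16_t));
    memcpy(br[i], input + (i + 16) * 16 + 8, 8 * sizeof(int16_t));
  }
}

idct32_16col then transposes each 16x16 half and runs the 32-point column IDCT as two idct32_8col calls, one for the left eight columns and one for the right eight.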
+static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+                                      __m128i *br) {
+  __m128i tmpl[16], tmpr[16];
+  int i;
+
+  // Copy the top half of the input to temporary storage
+  for (i = 0; i < 16; ++i) {
+    tmpl[i] = tl[i];
+    tmpr[i] = tr[i];
+  }
+
+  // Generate the top half of the output
+  for (i = 0; i < 16; ++i) {
+    tl[i] = _mm_slli_epi16(bl[i], 2);
+    tr[i] = _mm_slli_epi16(br[i], 2);
+  }
+  array_transpose_16x16(tl, tr);
+
+  // Copy the temporary storage back to the bottom half of the input
+  for (i = 0; i < 16; ++i) {
+    bl[i] = tmpl[i];
+    br[i] = tmpr[i];
+  }
+
+  // Generate the bottom half of the output
+  scale_sqrt2_8x16(bl);
+  scale_sqrt2_8x16(br);
+  idct16_sse2(bl, br);  // Includes a transposition
+}
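ihalfright32_16col is the 32-point transform used below for the ADST/FLIPADST column cases. Ignoring the SIMD transposes, it produces its first 16 outputs as the last 16 inputs scaled by 4, and its last 16 outputs as a 16-point IDCT of the first 16 inputs scaled by sqrt(2). A rough scalar sketch, using floating point for clarity (the SIMD code is fixed point) and a hypothetical idct16_scalar standing in for idct16_sse2:

#include <math.h>

/* Hypothetical scalar 16-point inverse DCT; stands in for idct16_sse2. */
void idct16_scalar(const double *in, double *out);

/* Scalar view of the half-right 32-point transform:
 *   out[0..15]  = 4 * in[16..31]
 *   out[16..31] = idct16(sqrt(2) * in[0..15])                              */
void ihalfright32_scalar(const double in[32], double out[32]) {
  double scaled[16];
  for (int i = 0; i < 16; ++i) out[i] = 4.0 * in[16 + i];
  for (int i = 0; i < 16; ++i) scaled[i] = sqrt(2.0) * in[i];
  idct16_scalar(scaled, out + 16);
}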
+static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+                                 __m128i *br) {
+  int i;
+  array_transpose_16x16(tl, tr);
+  array_transpose_16x16(bl, br);
+  for (i = 0; i < 16; ++i) {
+    tl[i] = _mm_slli_epi16(tl[i], 2);
+    tr[i] = _mm_slli_epi16(tr[i], 2);
+    bl[i] = _mm_slli_epi16(bl[i], 2);
+    br[i] = _mm_slli_epi16(br[i], 2);
+  }
+}
+static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
+                                             __m128i *intr, __m128i *inbl,
+                                             __m128i *inbr, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    intl[i] = _mm_adds_epi16(intl[i], final_rounding);
+    intr[i] = _mm_adds_epi16(intr[i], final_rounding);
+    inbl[i] = _mm_adds_epi16(inbl[i], final_rounding);
+    inbr[i] = _mm_adds_epi16(inbr[i], final_rounding);
+    intl[i] = _mm_srai_epi16(intl[i], 6);
+    intr[i] = _mm_srai_epi16(intr[i], 6);
+    inbl[i] = _mm_srai_epi16(inbl[i], 6);
+    inbr[i] = _mm_srai_epi16(inbr[i], 6);
+    RECON_AND_STORE(dest + i * stride + 0, intl[i]);
+    RECON_AND_STORE(dest + i * stride + 8, intr[i]);
+    RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]);
+    RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]);
+  }
+}
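The _round6 suffix refers to the write-back step shared by both new inverse transforms: each residual is rounded with 1 << 5, arithmetically shifted right by 6, added to the prediction pixel, and saturated to 8 bits by RECON_AND_STORE. A one-pixel scalar equivalent for clarity:

#include <stdint.h>

/* Scalar equivalent of one lane of write_buffer_*_round6 plus
 * RECON_AND_STORE: round the residual, shift by 6, add to the predictor,
 * and clamp to [0, 255]. */
uint8_t recon_round6(uint8_t pred, int16_t residual) {
  int v = pred + ((residual + (1 << 5)) >> 6);
  if (v < 0) v = 0;
  if (v > 255) v = 255;
  return (uint8_t)v;
}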
+void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
+                               int stride, int tx_type) {
+  __m128i intl[16], intr[16], inbl[16], inbr[16];
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    intl[i] = load_input_data(input + i * 16 + 0);
+    intr[i] = load_input_data(input + i * 16 + 8);
+    inbl[i] = load_input_data(input + (i + 16) * 16 + 0);
+    inbr[i] = load_input_data(input + (i + 16) * 16 + 8);
+  }
+
+  // Row transform
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case FLIPADST_DCT:
+    case H_DCT:
+      idct16_sse2(intl, intr);
+      idct16_sse2(inbl, inbr);
+      break;
+    case DCT_ADST:
+    case ADST_ADST:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case H_ADST:
+    case H_FLIPADST:
+      iadst16_sse2(intl, intr);
+      iadst16_sse2(inbl, inbr);
+      break;
+    case V_FLIPADST:
+    case V_ADST:
+    case V_DCT:
+    case IDTX:
+      iidtx16_sse2(intl, intr);
+      iidtx16_sse2(inbl, inbr);
+      break;
+    default: assert(0); break;
+  }
+
+  scale_sqrt2_8x16(intl);
+  scale_sqrt2_8x16(intr);
+  scale_sqrt2_8x16(inbl);
+  scale_sqrt2_8x16(inbr);
+
+  // Column transform
+  switch (tx_type) {
+    case DCT_DCT:
+    case DCT_ADST:
+    case DCT_FLIPADST:
+    case V_DCT:
+      idct32_16col(intl, intr, inbl, inbr);
+      break;
+    case ADST_DCT:
+    case ADST_ADST:
+    case FLIPADST_ADST:
+    case ADST_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case FLIPADST_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      ihalfright32_16col(intl, intr, inbl, inbr);
+      break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+    case IDTX:
+      iidtx32_16col(intl, intr, inbl, inbr);
+      break;
+    default: assert(0); break;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case H_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case H_ADST:
+    case V_ADST:
+    case V_DCT:
+    case IDTX: break;
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST:
+      for (i = 0; i < 16; ++i) {
+        __m128i tmp = intl[i];
+        intl[i] = mm_reverse_epi16(intr[i]);
+        intr[i] = mm_reverse_epi16(tmp);
+        tmp = inbl[i];
+        inbl[i] = mm_reverse_epi16(inbr[i]);
+        inbr[i] = mm_reverse_epi16(tmp);
+      }
+      break;
+    case FLIPADST_FLIPADST:
+      for (i = 0; i < 16; ++i) {
+        __m128i tmp = intl[i];
+        intl[i] = mm_reverse_epi16(intr[i]);
+        intr[i] = mm_reverse_epi16(tmp);
+        tmp = inbl[i];
+        inbl[i] = mm_reverse_epi16(inbr[i]);
+        inbr[i] = mm_reverse_epi16(tmp);
+      }
+      FLIPUD_PTR(dest, stride, 32);
+      break;
+    default: assert(0); break;
+  }
+  write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
+}
+static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
+                                             __m128i *in1, __m128i *in2,
+                                             __m128i *in3, int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    in0[i] = _mm_adds_epi16(in0[i], final_rounding);
+    in1[i] = _mm_adds_epi16(in1[i], final_rounding);
+    in2[i] = _mm_adds_epi16(in2[i], final_rounding);
+    in3[i] = _mm_adds_epi16(in3[i], final_rounding);
+    in0[i] = _mm_srai_epi16(in0[i], 6);
+    in1[i] = _mm_srai_epi16(in1[i], 6);
+    in2[i] = _mm_srai_epi16(in2[i], 6);
+    in3[i] = _mm_srai_epi16(in3[i], 6);
+    RECON_AND_STORE(dest + i * stride + 0, in0[i]);
+    RECON_AND_STORE(dest + i * stride + 8, in1[i]);
+    RECON_AND_STORE(dest + i * stride + 16, in2[i]);
+    RECON_AND_STORE(dest + i * stride + 24, in3[i]);
+  }
+}
+void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
+                               int stride, int tx_type) {
+  __m128i in0[16], in1[16], in2[16], in3[16];
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    in0[i] = load_input_data(input + i * 32 + 0);
+    in1[i] = load_input_data(input + i * 32 + 8);
+    in2[i] = load_input_data(input + i * 32 + 16);
+    in3[i] = load_input_data(input + i * 32 + 24);
+  }
+
+  // Row transform
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case FLIPADST_DCT:
+    case H_DCT:
+      idct32_16col(in0, in1, in2, in3);
+      break;
+    case DCT_ADST:
+    case ADST_ADST:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case H_ADST:
+    case H_FLIPADST:
+      ihalfright32_16col(in0, in1, in2, in3);
+      break;
+    case V_FLIPADST:
+    case V_ADST:
+    case V_DCT:
+    case IDTX:
+      iidtx32_16col(in0, in1, in2, in3);
+      break;
+    default: assert(0); break;
+  }
+
+  scale_sqrt2_8x16(in0);
+  scale_sqrt2_8x16(in1);
+  scale_sqrt2_8x16(in2);
+  scale_sqrt2_8x16(in3);
+
+  // Column transform
+  switch (tx_type) {
+    case DCT_DCT:
+    case DCT_ADST:
+    case DCT_FLIPADST:
+    case V_DCT:
+      idct16_sse2(in0, in1);
+      idct16_sse2(in2, in3);
+      break;
+    case ADST_DCT:
+    case ADST_ADST:
+    case FLIPADST_ADST:
+    case ADST_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case FLIPADST_DCT:
+    case V_ADST:
+    case V_FLIPADST: