Commit 33231d48 authored by David Barker
Browse files

Add sse2 forward and inverse 16x32 and 32x16 transforms

Change-Id: I1241257430f1e08ead1ce0f31db8272b50783102
parent cad8283e
......@@ -186,6 +186,7 @@ DSP_SRCS-yes += fwd_txfm.c
DSP_SRCS-yes += fwd_txfm.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32_8cols_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
ifeq ($(ARCH_X86_64),yes)
......
This diff is collapsed.
......@@ -365,6 +365,8 @@ static INLINE void transpose_and_output8x8(
}
}
void fdct32_8col(__m128i *in0, __m128i *in1);
#ifdef __cplusplus
} // extern "C"
#endif
......
......@@ -2669,28 +2669,28 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
stp1_31 = stp2_31; \
}
#define IDCT32 \
#define IDCT32(in0, in1) \
/* Stage1 */ \
{ \
const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
\
const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
\
const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
\
const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]); \
const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]); \
const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]); \
const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]); \
\
const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]); \
const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]); \
const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]); \
const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]); \
\
const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]); \
const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]); \
const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]); \
const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]); \
\
const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]); \
const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]); \
const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]); \
const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]); \
\
MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
......@@ -2707,15 +2707,15 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
/* Stage2 */ \
{ \
const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]); \
const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]); \
const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]); \
const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]); \
\
const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]); \
const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]); \
const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]); \
const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]); \
\
MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
......@@ -2747,10 +2747,10 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
/* Stage3 */ \
{ \
const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]); \
const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]); \
const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]); \
const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]); \
\
const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
......@@ -2794,10 +2794,10 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
/* Stage4 */ \
{ \
const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]); \
const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]); \
const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]); \
const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]); \
\
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
......@@ -3338,7 +3338,7 @@ void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
array_transpose_8x8(in + 16, in + 16);
array_transpose_8x8(in + 24, in + 24);
IDCT32
IDCT32(in, in + 16)
// 1_D: Store 32 intermediate results for each 8x32 block.
col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
......@@ -3384,7 +3384,7 @@ void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
array_transpose_8x8(col + j + 64, in + 16);
array_transpose_8x8(col + j + 96, in + 24);
IDCT32
IDCT32(in, in + 16)
// 2_D: Calculate the results and store them to destination.
in[0] = _mm_add_epi16(stp1_0, stp1_31);
......@@ -3451,6 +3451,107 @@ void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
}
}
// Apply a 32-element IDCT to 8 columns. This does not do any transposition
// of its input - the caller is expected to have done that.
// The input buffers are the top and bottom halves of an 8x32 block.
// The transform runs in-place: on return, in0 holds output rows 0-15 and
// in1 holds output rows 16-31, produced by the final butterfly below.
void idct32_8col(__m128i *in0, __m128i *in1) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
// idct constants for each stage
// Each stgN_k packs a (cosine, cosine) coefficient pair into alternating
// 16-bit lanes, for use with the unpack + multiply-add butterfly pattern.
const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
// Intermediate results of the odd (stp1_*) and even (stp2_*) pipeline
// stages; the IDCT32 macro reads and writes these by name.
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
// Scratch registers used internally by the IDCT32 macro.
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// Run the shared 32-point IDCT stages; reads in0[0..15]/in1[0..15] and
// leaves the stage-7 results in stp1_0..stp1_31.
IDCT32(in0, in1)
// Final butterfly: output row k = stp1_k + stp1_(31-k), and output row
// (31-k) = stp1_k - stp1_(31-k), written back in-place over the inputs.
in0[0] = _mm_add_epi16(stp1_0, stp1_31);
in0[1] = _mm_add_epi16(stp1_1, stp1_30);
in0[2] = _mm_add_epi16(stp1_2, stp1_29);
in0[3] = _mm_add_epi16(stp1_3, stp1_28);
in0[4] = _mm_add_epi16(stp1_4, stp1_27);
in0[5] = _mm_add_epi16(stp1_5, stp1_26);
in0[6] = _mm_add_epi16(stp1_6, stp1_25);
in0[7] = _mm_add_epi16(stp1_7, stp1_24);
in0[8] = _mm_add_epi16(stp1_8, stp1_23);
in0[9] = _mm_add_epi16(stp1_9, stp1_22);
in0[10] = _mm_add_epi16(stp1_10, stp1_21);
in0[11] = _mm_add_epi16(stp1_11, stp1_20);
in0[12] = _mm_add_epi16(stp1_12, stp1_19);
in0[13] = _mm_add_epi16(stp1_13, stp1_18);
in0[14] = _mm_add_epi16(stp1_14, stp1_17);
in0[15] = _mm_add_epi16(stp1_15, stp1_16);
in1[0] = _mm_sub_epi16(stp1_15, stp1_16);
in1[1] = _mm_sub_epi16(stp1_14, stp1_17);
in1[2] = _mm_sub_epi16(stp1_13, stp1_18);
in1[3] = _mm_sub_epi16(stp1_12, stp1_19);
in1[4] = _mm_sub_epi16(stp1_11, stp1_20);
in1[5] = _mm_sub_epi16(stp1_10, stp1_21);
in1[6] = _mm_sub_epi16(stp1_9, stp1_22);
in1[7] = _mm_sub_epi16(stp1_8, stp1_23);
in1[8] = _mm_sub_epi16(stp1_7, stp1_24);
in1[9] = _mm_sub_epi16(stp1_6, stp1_25);
in1[10] = _mm_sub_epi16(stp1_5, stp1_26);
in1[11] = _mm_sub_epi16(stp1_4, stp1_27);
in1[12] = _mm_sub_epi16(stp1_3, stp1_28);
in1[13] = _mm_sub_epi16(stp1_2, stp1_29);
in1[14] = _mm_sub_epi16(stp1_1, stp1_30);
in1[15] = _mm_sub_epi16(stp1_0, stp1_31);
}
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
__m128i ubounded, retval;
......
......@@ -203,5 +203,6 @@ void idct16_sse2(__m128i *in0, __m128i *in1);
void iadst4_sse2(__m128i *in);
void iadst8_sse2(__m128i *in);
void iadst16_sse2(__m128i *in0, __m128i *in1);
void idct32_8col(__m128i *in0, __m128i *in1);
#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_
......@@ -103,10 +103,10 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_iht16x8_128_add sse2/;
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add/;
specialize qw/av1_iht16x32_512_add sse2/;
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add/;
specialize qw/av1_iht32x16_512_add sse2/;
}
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
......@@ -164,10 +164,10 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_iht16x8_128_add sse2/;
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add/;
specialize qw/av1_iht16x32_512_add sse2/;
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add/;
specialize qw/av1_iht32x16_512_add sse2/;
}
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
......@@ -404,10 +404,10 @@ if (aom_config("CONFIG_EXT_TX") eq "yes") {
specialize qw/av1_fht16x8 sse2/;
add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht16x32/;
specialize qw/av1_fht16x32 sse2/;
add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x16/;
specialize qw/av1_fht32x16 sse2/;
add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x32 avx2/;
......
......@@ -496,6 +496,12 @@ static void iidtx16_8col(__m128i *in) {
in[15] = _mm_packs_epi32(u7, y7);
}
// 16-column identity transform: transpose the 16x16 block held in the two
// 8x16 halves, then run the 8-column identity transform on each half.
static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
  __m128i *const halves[2] = { in0, in1 };
  int h;
  array_transpose_16x16(in0, in1);
  for (h = 0; h < 2; ++h) iidtx16_8col(halves[h]);
}
static void iidtx8_sse2(__m128i *in) {
in[0] = _mm_slli_epi16(in[0], 1);
in[1] = _mm_slli_epi16(in[1], 1);
......@@ -628,6 +634,11 @@ static INLINE void scale_sqrt2_8x8(__m128i *in) {
xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
}
// Scale an 8x16 block (16 consecutive __m128i rows) by sqrt(2), applying
// the existing 8x8 scaling helper to the upper and lower 8-row halves.
static INLINE void scale_sqrt2_8x16(__m128i *in) {
  int half;
  for (half = 0; half < 2; ++half) scale_sqrt2_8x8(&in[8 * half]);
}
void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride, int tx_type) {
__m128i in[16];
......@@ -1202,4 +1213,322 @@ void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[3] = _mm_unpacklo_epi64(in[6], in[7]);
write_buffer_4x8_round5(dest, in, stride);
}
// Note: The 16-column 32-element transforms take their input as four 8x16
// blocks (each a __m128i[16]) which are the four quadrants of the overall
// 16x32 input buffer: tl/tr on top, bl/br on the bottom.
static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
                                __m128i *br) {
  // Bring columns into rows (top 16x16 half, then bottom 16x16 half).
  array_transpose_16x16(tl, tr);
  array_transpose_16x16(bl, br);
  // 32-point IDCT over the right 8 columns, then the left 8 columns.
  idct32_8col(tr, br);
  idct32_8col(tl, bl);
}
// "Half-right" 32-point inverse transform over 16 columns. The top half of
// the output is simply the bottom half of the input scaled by 4; the bottom
// half of the output is the original top half, scaled by sqrt(2) and passed
// through a 16-point IDCT.
static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
                                      __m128i *br) {
  int i;
  // Swap the halves in a single pass: the bottom input (times 4) becomes
  // the top output, while the saved top input drops down to feed the
  // 16-point IDCT below. This fuses the save/overwrite/restore loops of a
  // three-loop formulation without changing the results.
  for (i = 0; i < 16; ++i) {
    const __m128i old_tl = tl[i];
    const __m128i old_tr = tr[i];
    tl[i] = _mm_slli_epi16(bl[i], 2);
    tr[i] = _mm_slli_epi16(br[i], 2);
    bl[i] = old_tl;
    br[i] = old_tr;
  }
  array_transpose_16x16(tl, tr);
  // Bottom half of the output: sqrt(2) scaling followed by the 16-point
  // IDCT (which performs its own transposition).
  scale_sqrt2_8x16(bl);
  scale_sqrt2_8x16(br);
  idct16_sse2(bl, br);
}
// 32-point identity transform over 16 columns: transpose both 16x16 halves,
// then scale every element by 4 (two left-shifts of the 16-bit lanes).
static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
                                 __m128i *br) {
  __m128i *const quad[4] = { tl, tr, bl, br };
  int q, i;
  array_transpose_16x16(tl, tr);
  array_transpose_16x16(bl, br);
  for (q = 0; q < 4; ++q) {
    for (i = 0; i < 16; ++i) {
      quad[q][i] = _mm_slli_epi16(quad[q][i], 2);
    }
  }
}
// Round, downshift by 6 and accumulate a reconstructed 16x32 block into
// `dest`. The four __m128i[16] arguments are the quadrants of the block
// (top-left, top-right, bottom-left, bottom-right); their contents are
// clobbered by the rounding performed in-place below.
static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
__m128i *intr, __m128i *inbl,
__m128i *inbr, int stride) {
// NOTE(review): `zero` looks unused in this function body, but
// RECON_AND_STORE is a macro defined elsewhere and presumably references
// it by name - confirm before removing.
const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 5);
int i;
for (i = 0; i < 16; ++i) {
// Add 2^5 then arithmetic-shift right by 6: round-to-nearest for the
// final inverse-transform scaling. Saturating adds guard overflow.
intl[i] = _mm_adds_epi16(intl[i], final_rounding);
intr[i] = _mm_adds_epi16(intr[i], final_rounding);
inbl[i] = _mm_adds_epi16(inbl[i], final_rounding);
inbr[i] = _mm_adds_epi16(inbr[i], final_rounding);
intl[i] = _mm_srai_epi16(intl[i], 6);
intr[i] = _mm_srai_epi16(intr[i], 6);
inbl[i] = _mm_srai_epi16(inbl[i], 6);
inbr[i] = _mm_srai_epi16(inbr[i], 6);
// Row i of the top half spans dest columns 0-15; row i of the bottom
// half lands 16 rows further down.
RECON_AND_STORE(dest + i * stride + 0, intl[i]);
RECON_AND_STORE(dest + i * stride + 8, intr[i]);
RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]);
RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]);
}
}
// Inverse 16x32 hybrid transform, SSE2. Applies the row transform selected
// by `tx_type`, a sqrt(2) correction (needed because the transform is
// non-square), the column transform, any horizontal/vertical flips, and
// finally rounds and accumulates the result into `dest`.
// Fix vs. original: the horizontal-mirror loop was duplicated verbatim
// between the *_FLIPADST cases and FLIPADST_FLIPADST; the cases are merged
// here with behavior unchanged.
void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride, int tx_type) {
  __m128i intl[16], intr[16], inbl[16], inbr[16];
  int i;

  // Load the 16x32 input as four 8x16 quadrants:
  // intl/intr = top-left/top-right, inbl/inbr = bottom-left/bottom-right.
  for (i = 0; i < 16; ++i) {
    intl[i] = load_input_data(input + i * 16 + 0);
    intr[i] = load_input_data(input + i * 16 + 8);
    inbl[i] = load_input_data(input + (i + 16) * 16 + 0);
    inbr[i] = load_input_data(input + (i + 16) * 16 + 8);
  }

  // Row transform (16-point), applied to the top and bottom 16x16 halves.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case FLIPADST_DCT:
    case H_DCT:
      idct16_sse2(intl, intr);
      idct16_sse2(inbl, inbr);
      break;
    case DCT_ADST:
    case ADST_ADST:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
    case H_ADST:
    case H_FLIPADST:
      iadst16_sse2(intl, intr);
      iadst16_sse2(inbl, inbr);
      break;
    case V_FLIPADST:
    case V_ADST:
    case V_DCT:
    case IDTX:
      iidtx16_sse2(intl, intr);
      iidtx16_sse2(inbl, inbr);
      break;
    default: assert(0); break;
  }

  // Non-square transform: every quadrant needs a sqrt(2) scale correction.
  scale_sqrt2_8x16(intl);
  scale_sqrt2_8x16(intr);
  scale_sqrt2_8x16(inbl);
  scale_sqrt2_8x16(inbr);

  // Column transform (32-point).
  switch (tx_type) {
    case DCT_DCT:
    case DCT_ADST:
    case DCT_FLIPADST:
    case V_DCT: idct32_16col(intl, intr, inbl, inbr); break;
    case ADST_DCT:
    case ADST_ADST:
    case FLIPADST_ADST:
    case ADST_FLIPADST:
    case FLIPADST_FLIPADST:
    case FLIPADST_DCT:
    case V_ADST:
    case V_FLIPADST: ihalfright32_16col(intl, intr, inbl, inbr); break;
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
    default: assert(0); break;
  }

  // Apply the flips demanded by the tx type. A horizontal flip mirrors each
  // row (swapping the left/right quadrants); a vertical flip is realized by
  // inverting the destination pointer/stride via FLIPUD_PTR.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case H_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case H_ADST:
    case V_ADST:
    case V_DCT:
    case IDTX: break;
    case FLIPADST_DCT:
    case FLIPADST_ADST:
    case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
    case DCT_FLIPADST:
    case ADST_FLIPADST:
    case H_FLIPADST:
    case FLIPADST_FLIPADST:
      // Horizontal mirror, common to all four of these types.
      for (i = 0; i < 16; ++i) {
        __m128i tmp = intl[i];
        intl[i] = mm_reverse_epi16(intr[i]);
        intr[i] = mm_reverse_epi16(tmp);
        tmp = inbl[i];
        inbl[i] = mm_reverse_epi16(inbr[i]);
        inbr[i] = mm_reverse_epi16(tmp);
      }
      // FLIPADST_FLIPADST additionally flips vertically.
      if (tx_type == FLIPADST_FLIPADST) FLIPUD_PTR(dest, stride, 32);
      break;
    default: assert(0); break;
  }

  // Round (add 1 << 5, shift by 6) and accumulate into the destination.
  write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
}
// Round, downshift by 6 and accumulate a reconstructed 32x16 block into
// `dest`. The four __m128i[16] arguments are 8-column vertical strips of
// the block, left to right; their contents are clobbered by the in-place
// rounding below.
static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
__m128i *in1, __m128i *in2,
__m128i *in3, int stride) {
// NOTE(review): `zero` looks unused in this function body, but
// RECON_AND_STORE is a macro defined elsewhere and presumably references
// it by name - confirm before removing.
const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 5);
int i;
for (i = 0; i < 16; ++i) {
// Add 2^5 then arithmetic-shift right by 6: round-to-nearest for the
// final inverse-transform scaling. Saturating adds guard overflow.
in0[i] = _mm_adds_epi16(in0[i], final_rounding);
in1[i] = _mm_adds_epi16(in1[i], final_rounding);
in2[i] = _mm_adds_epi16(in2[i], final_rounding);
in3[i] = _mm_adds_epi16(in3[i], final_rounding);
in0[i] = _mm_srai_epi16(in0[i], 6);
in1[i] = _mm_srai_epi16(in1[i], 6);
in2[i] = _mm_srai_epi16(in2[i], 6);
in3[i] = _mm_srai_epi16(in3[i], 6);
// The four strips cover dest columns 0-7, 8-15, 16-23 and 24-31 of row i.
RECON_AND_STORE(dest + i * stride + 0, in0[i]);
RECON_AND_STORE(dest + i * stride + 8, in1[i]);
RECON_AND_STORE(dest + i * stride + 16, in2[i]);
RECON_AND_STORE(dest + i * stride + 24, in3[i]);
}
}
void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride, int tx_type) {
__m128i in0[16], in1[16], in2[16], in3[16];
int i;
for (i = 0; i < 16; ++i) {
in0[i] = load_input_data(input + i * 32 + 0);
in1[i] = load_input_data(input + i * 32 + 8);
in2[i] = load_input_data(input + i * 32 + 16);
in3[i] = load_input_data(input + i * 32 + 24);
}
// Row transform
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
case FLIPADST_DCT:
case H_DCT: idct32_16col(in0, in1, in2, in3); break;
case DCT_ADST:
case ADST_ADST:
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
case H_ADST:
case H_FLIPADST: ihalfright32_16col(in0, in1, in2, in3); break;
case V_FLIPADST:
case V_ADST:
case V_DCT:
case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
default: assert(0); break;
}
scale_sqrt2_8x16(in0);
scale_sqrt2_8x16(in1);
scale_sqrt2_8x16(in2);
scale_sqrt2_8x16(in3);
// Column transform
switch (tx_type) {
case DCT_DCT:
case DCT_ADST:
case DCT_FLIPADST:
case V_DCT:
idct16_sse2(in0, in1);
idct16_sse2(in2, in3);
break;
case ADST_DCT:
case ADST_ADST:
case FLIPADST_ADST:
case ADST_FLIPADST:
case FLIPADST_FLIPADST:
case FLIPADST_DCT:
case V_ADST:
case V_FLIPADST:
iadst16_sse2(in0, in1);
iadst16_sse2(in2, in3);