Commit a720f4b3 authored by Debargha Mukherjee, committed by Gerrit Code Review

Merge "Add sse2 forward and inverse 16x32 and 32x16 transforms" into nextgenv2

parents a48764d0 33231d48
@@ -186,6 +186,7 @@ DSP_SRCS-yes += fwd_txfm.c
DSP_SRCS-yes += fwd_txfm.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32_8cols_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
ifeq ($(ARCH_X86_64),yes)
......
This diff is collapsed.
@@ -365,6 +365,8 @@ static INLINE void transpose_and_output8x8(
}
}
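// 32-point forward DCT applied to 8 columns, implemented in the newly added
// x86/fwd_dct32_8cols_sse2.c (see the makefile change above); the forward
// counterpart of the idct32_8col routine added by this change.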
void fdct32_8col(__m128i *in0, __m128i *in1);
#ifdef __cplusplus
} // extern "C"
#endif
......
@@ -2669,28 +2669,28 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
stp1_31 = stp2_31; \
}
#define IDCT32 \
#define IDCT32(in0, in1) \
/* Stage1 */ \
{ \
const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
\
const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
\
const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
\
const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]); \
const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]); \
const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]); \
const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]); \
\
const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]); \
const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]); \
const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]); \
const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]); \
\
const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]); \
const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]); \
const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]); \
const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]); \
\
const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]); \
const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]); \
const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]); \
const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]); \
\
MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
@@ -2707,15 +2707,15 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
/* Stage2 */ \
{ \
const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]); \
const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]); \
const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]); \
const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]); \
\
const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]); \
const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]); \
const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]); \
const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]); \
\
MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
@@ -2747,10 +2747,10 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
/* Stage3 */ \
{ \
const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]); \
const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]); \
const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]); \
const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]); \
\
const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
@@ -2794,10 +2794,10 @@ void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
\
/* Stage4 */ \
{ \
const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]); \
const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]); \
const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]); \
const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]); \
\
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
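The rewritten macro splits the old 32-entry array in[] into two 16-entry
halves, so in[k] becomes (in0)[k] for k < 16 and (in1)[k - 16] for k >= 16
(e.g. in[31] is now (in1)[15], and in[17] is (in1)[1]). A minimal sketch of
that mapping, using a hypothetical helper that is not part of this patch:

static INLINE __m128i idct32_input(const __m128i *in0, const __m128i *in1,
                                   int k) {
  // Old single-buffer index k, redirected to the two 16-register halves.
  return (k < 16) ? in0[k] : in1[k - 16];
}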
@@ -3338,7 +3338,7 @@ void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
array_transpose_8x8(in + 16, in + 16);
array_transpose_8x8(in + 24, in + 24);
IDCT32
IDCT32(in, in + 16)
// 1_D: Store 32 intermediate results for each 8x32 block.
col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3384,7 +3384,7 @@ void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
array_transpose_8x8(col + j + 64, in + 16);
array_transpose_8x8(col + j + 96, in + 24);
IDCT32
IDCT32(in, in + 16)
// 2_D: Calculate the results and store them to destination.
in[0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3451,6 +3451,107 @@ void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
}
}
// Apply a 32-element IDCT to 8 columns. This does not do any transposition
// of its input; the caller is expected to have done that.
// The input buffers are the top and bottom halves of an 8x32 block.
void idct32_8col(__m128i *in0, __m128i *in1) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
// idct constants for each stage
const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
IDCT32(in0, in1)
// Final butterfly stage: combine the stp1 values and write the 32 results
// back into in0 (rows 0-15) and in1 (rows 16-31) in place.
in0[0] = _mm_add_epi16(stp1_0, stp1_31);
in0[1] = _mm_add_epi16(stp1_1, stp1_30);
in0[2] = _mm_add_epi16(stp1_2, stp1_29);
in0[3] = _mm_add_epi16(stp1_3, stp1_28);
in0[4] = _mm_add_epi16(stp1_4, stp1_27);
in0[5] = _mm_add_epi16(stp1_5, stp1_26);
in0[6] = _mm_add_epi16(stp1_6, stp1_25);
in0[7] = _mm_add_epi16(stp1_7, stp1_24);
in0[8] = _mm_add_epi16(stp1_8, stp1_23);
in0[9] = _mm_add_epi16(stp1_9, stp1_22);
in0[10] = _mm_add_epi16(stp1_10, stp1_21);
in0[11] = _mm_add_epi16(stp1_11, stp1_20);
in0[12] = _mm_add_epi16(stp1_12, stp1_19);
in0[13] = _mm_add_epi16(stp1_13, stp1_18);
in0[14] = _mm_add_epi16(stp1_14, stp1_17);
in0[15] = _mm_add_epi16(stp1_15, stp1_16);
in1[0] = _mm_sub_epi16(stp1_15, stp1_16);
in1[1] = _mm_sub_epi16(stp1_14, stp1_17);
in1[2] = _mm_sub_epi16(stp1_13, stp1_18);
in1[3] = _mm_sub_epi16(stp1_12, stp1_19);
in1[4] = _mm_sub_epi16(stp1_11, stp1_20);
in1[5] = _mm_sub_epi16(stp1_10, stp1_21);
in1[6] = _mm_sub_epi16(stp1_9, stp1_22);
in1[7] = _mm_sub_epi16(stp1_8, stp1_23);
in1[8] = _mm_sub_epi16(stp1_7, stp1_24);
in1[9] = _mm_sub_epi16(stp1_6, stp1_25);
in1[10] = _mm_sub_epi16(stp1_5, stp1_26);
in1[11] = _mm_sub_epi16(stp1_4, stp1_27);
in1[12] = _mm_sub_epi16(stp1_3, stp1_28);
in1[13] = _mm_sub_epi16(stp1_2, stp1_29);
in1[14] = _mm_sub_epi16(stp1_1, stp1_30);
in1[15] = _mm_sub_epi16(stp1_0, stp1_31);
}
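// The last stage above is the standard IDCT output butterfly:
// out[k] = stp1_k + stp1_{31-k} for k < 16 (stored to in0) and
// out[16+k] = stp1_{15-k} - stp1_{16+k} (stored to in1).
// idct32_16col, later in this change, shows the intended calling pattern:
// transpose the 16x16 quadrants, then run the two 8-column halves.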
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
__m128i ubounded, retval;
......
@@ -203,5 +203,6 @@ void idct16_sse2(__m128i *in0, __m128i *in1);
void iadst4_sse2(__m128i *in);
void iadst8_sse2(__m128i *in);
void iadst16_sse2(__m128i *in0, __m128i *in1);
void idct32_8col(__m128i *in0, __m128i *in1);
#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_
@@ -104,10 +104,10 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_iht16x8_128_add sse2/;
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add/;
specialize qw/av1_iht16x32_512_add sse2/;
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add/;
specialize qw/av1_iht32x16_512_add sse2/;
}
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
@@ -165,10 +165,10 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_iht16x8_128_add sse2/;
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add/;
specialize qw/av1_iht16x32_512_add sse2/;
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add/;
specialize qw/av1_iht32x16_512_add sse2/;
}
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
@@ -405,10 +405,10 @@ if (aom_config("CONFIG_EXT_TX") eq "yes") {
specialize qw/av1_fht16x8 sse2/;
add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht16x32/;
specialize qw/av1_fht16x32 sse2/;
add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x16/;
specialize qw/av1_fht32x16 sse2/;
add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x32 avx2/;
......
@@ -496,6 +496,12 @@ static void iidtx16_8col(__m128i *in) {
in[15] = _mm_packs_epi32(u7, y7);
}
static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
iidtx16_8col(in0);
iidtx16_8col(in1);
}
static void iidtx8_sse2(__m128i *in) {
in[0] = _mm_slli_epi16(in[0], 1);
in[1] = _mm_slli_epi16(in[1], 1);
@@ -628,6 +634,11 @@ static INLINE void scale_sqrt2_8x8(__m128i *in) {
xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
}
static INLINE void scale_sqrt2_8x16(__m128i *in) {
scale_sqrt2_8x8(in);
scale_sqrt2_8x8(in + 8);
}
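// The sqrt(2) rescale is needed because the 16x32/32x16 transforms pair a
// 16-point and a 32-point 1-D transform: their combined gain is
// 16 * sqrt(2), so one sqrt(2) pass between the row and column transforms
// restores the power-of-two overall scale of the square-transform paths.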
void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride, int tx_type) {
__m128i in[16];
@@ -1202,4 +1213,322 @@ void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[3] = _mm_unpacklo_epi64(in[6], in[7]);
write_buffer_4x8_round5(dest, in, stride);
}
// Note: The 16-column 32-element transforms take input in the form of four
// 8x16 blocks (each stored as a __m128i[16]), which are the four quadrants
// of the overall 16x32 input buffer.
static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
__m128i *br) {
array_transpose_16x16(tl, tr);
array_transpose_16x16(bl, br);
idct32_8col(tl, bl);
idct32_8col(tr, br);
}
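// Quadrant layout assumed by the *_16col helpers (each quadrant is a
// __m128i[16], i.e. 16 rows of eight 16-bit lanes):
//
//                columns 0-7   columns 8-15
//   rows  0-15:      tl             tr
//   rows 16-31:      bl             br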
static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
__m128i *br) {
__m128i tmpl[16], tmpr[16];
int i;
// Copy the top half of the input to temporary storage
for (i = 0; i < 16; ++i) {
tmpl[i] = tl[i];
tmpr[i] = tr[i];
}
// Generate the top half of the output
for (i = 0; i < 16; ++i) {
tl[i] = _mm_slli_epi16(bl[i], 2);
tr[i] = _mm_slli_epi16(br[i], 2);
}
array_transpose_16x16(tl, tr);
// Copy the temporary storage back to the bottom half of the input
for (i = 0; i < 16; ++i) {
bl[i] = tmpl[i];
br[i] = tmpr[i];
}
// Generate the bottom half of the output
scale_sqrt2_8x16(bl);
scale_sqrt2_8x16(br);
idct16_sse2(bl, br); // Includes a transposition
}
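// Structure of the half-right 32-point transform, as implemented above:
// the top 16 output rows are the bottom 16 input rows scaled by 4 (the
// << 2), and the bottom 16 output rows are the original top 16 input rows
// scaled by sqrt(2) and passed through a 16-point IDCT.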
static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
__m128i *br) {
int i;
array_transpose_16x16(tl, tr);
array_transpose_16x16(bl, br);
for (i = 0; i < 16; ++i) {
tl[i] = _mm_slli_epi16(tl[i], 2);
tr[i] = _mm_slli_epi16(tr[i], 2);
bl[i] = _mm_slli_epi16(bl[i], 2);
br[i] = _mm_slli_epi16(br[i], 2);
}
}
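// iidtx32 is the 32-point identity transform: every coefficient is scaled
// by 4 (<< 2); the transposes keep its output orientation consistent with
// the other transform kernels, which also transpose in place.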
static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
__m128i *intr, __m128i *inbl,
__m128i *inbr, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 5);
int i;
for (i = 0; i < 16; ++i) {
intl[i] = _mm_adds_epi16(intl[i], final_rounding);
intr[i] = _mm_adds_epi16(intr[i], final_rounding);
inbl[i] = _mm_adds_epi16(inbl[i], final_rounding);
inbr[i] = _mm_adds_epi16(inbr[i], final_rounding);
intl[i] = _mm_srai_epi16(intl[i], 6);
intr[i] = _mm_srai_epi16(intr[i], 6);
inbl[i] = _mm_srai_epi16(inbl[i], 6);
inbr[i] = _mm_srai_epi16(inbr[i], 6);
RECON_AND_STORE(dest + i * stride + 0, intl[i]);
RECON_AND_STORE(dest + i * stride + 8, intr[i]);
RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]);
RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]);
}
}
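// The add-(1 << 5) / arithmetic-shift-by-6 pair above is the SIMD form of
// ROUND_POWER_OF_TWO(x, 6): it rounds away the inverse transform's six
// extra precision bits before RECON_AND_STORE adds the prediction and
// saturates to 8 bits.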
void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride, int tx_type) {
__m128i intl[16], intr[16], inbl[16], inbr[16];
int i;
for (i = 0; i < 16; ++i) {
intl[i] = load_input_data(input + i * 16 + 0);
intr[i] = load_input_data(input + i * 16 + 8);
inbl[i] = load_input_data(input + (i + 16) * 16 + 0);
inbr[i] = load_input_data(input + (i + 16) * 16 + 8);
}
// Row transform
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
case FLIPADST_DCT:
case H_DCT:
idct16_sse2(intl, intr);
idct16_sse2(inbl, inbr);
break;
case DCT_ADST:
case ADST_ADST:
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
case H_ADST:
case H_FLIPADST:
iadst16_sse2(intl, intr);
iadst16_sse2(inbl, inbr);
break;
case V_FLIPADST:
case V_ADST:
case V_DCT:
case IDTX:
iidtx16_sse2(intl, intr);
iidtx16_sse2(inbl, inbr);
break;
default: assert(0); break;
}
scale_sqrt2_8x16(intl);
scale_sqrt2_8x16(intr);
scale_sqrt2_8x16(inbl);
scale_sqrt2_8x16(inbr);
// Column transform
switch (tx_type) {
case DCT_DCT:
case DCT_ADST:
case DCT_FLIPADST:
case V_DCT: idct32_16col(intl, intr, inbl, inbr); break;
case ADST_DCT:
case ADST_ADST:
case FLIPADST_ADST:
case ADST_FLIPADST:
case FLIPADST_FLIPADST:
case FLIPADST_DCT:
case V_ADST:
case V_FLIPADST: ihalfright32_16col(intl, intr, inbl, inbr); break;
case H_DCT:
case H_ADST:
case H_FLIPADST:
case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
default: assert(0); break;
}
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
case H_DCT:
case DCT_ADST:
case ADST_ADST:
case H_ADST:
case V_ADST:
case V_DCT:
case IDTX: break;
case FLIPADST_DCT:
case FLIPADST_ADST:
case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
case DCT_FLIPADST:
case ADST_FLIPADST:
case H_FLIPADST:
for (i = 0; i < 16; ++i) {
__m128i tmp = intl[i];
intl[i] = mm_reverse_epi16(intr[i]);
intr[i] = mm_reverse_epi16(tmp);
tmp = inbl[i];
inbl[i] = mm_reverse_epi16(inbr[i]);
inbr[i] = mm_reverse_epi16(tmp);
}
break;
case FLIPADST_FLIPADST:
for (i = 0; i < 16; ++i) {
__m128i tmp = intl[i];
intl[i] = mm_reverse_epi16(intr[i]);
intr[i] = mm_reverse_epi16(tmp);
tmp = inbl[i];
inbl[i] = mm_reverse_epi16(inbr[i]);
inbr[i] = mm_reverse_epi16(tmp);
}
FLIPUD_PTR(dest, stride, 32);
break;
default: assert(0); break;
}
write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
}
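// The 32x16 path below mirrors the 16x32 one with the roles of rows and
// columns swapped: the 32-point kernels (idct32_16col, ihalfright32_16col,
// iidtx32_16col) run on the rows and the 16-point kernels on the columns,
// with the same sqrt(2) rescale in between.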
static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
__m128i *in1, __m128i *in2,
__m128i *in3, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 5);
int i;
for (i = 0; i < 16; ++i) {
in0[i] = _mm_adds_epi16(in0[i], final_rounding);
in1[i] = _mm_adds_epi16(in1[i], final_rounding);
in2[i] = _mm_adds_epi16(in2[i], final_rounding);
in3[i] = _mm_adds_epi16(in3[i], final_rounding);
in0[i] = _mm_srai_epi16(in0[i], 6);
in1[i] = _mm_srai_epi16(in1[i], 6);
in2[i] = _mm_srai_epi16(in2[i], 6);
in3[i] = _mm_srai_epi16(in3[i], 6);
RECON_AND_STORE(dest + i * stride + 0, in0[i]);
RECON_AND_STORE(dest + i * stride + 8, in1[i]);
RECON_AND_STORE(dest + i * stride + 16, in2[i]);
RECON_AND_STORE(dest + i * stride + 24, in3[i]);
}
}
void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride, int tx_type) {
__m128i in0[16], in1[16], in2[16], in3[16];
int i;
for (i = 0; i < 16; ++i) {
in0[i] = load_input_data(input + i * 32 + 0);
in1[i] = load_input_data(input + i * 32 + 8);
in2[i] = load_input_data(input + i * 32 + 16);
in3[i] = load_input_data(input + i * 32 + 24);
}
// Row transform
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
case FLIPADST_DCT:
case H_DCT: idct32_16col(in0, in1, in2, in3); break;
case DCT_ADST:
case ADST_ADST:
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
case H_ADST:
case H_FLIPADST: ihalfright32_16col(in0, in1, in2, in3); break;
case V_FLIPADST:
case V_ADST:
case V_DCT:
case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
default: assert(0); break;
}
scale_sqrt2_8x16(in0);
scale_sqrt2_8x16(in1);
scale_sqrt2_8x16(in2);
scale_sqrt2_8x16(in3);
// Column transform
switch (tx_type) {
case DCT_DCT:
case DCT_ADST:
case DCT_FLIPADST:
case V_DCT:
idct16_sse2(in0, in1);
idct16_sse2(in2, in3);
break;
case ADST_DCT:
case ADST_ADST:
case FLIPADST_ADST:
case ADST_FLIPADST:
case FLIPADST_FLIPADST:
case FLIPADST_DCT:
case V_ADST:
case V_FLIPADST:
iadst16_sse2(in0, in1);
iadst16_sse2(in2, in3);