Commit 1baecfeb authored by Peter de Rivaz's avatar Peter de Rivaz Committed by Debargha Mukherjee
Browse files

Added sse2 inverse 8x16 and 16x8 transforms

Change-Id: I43628407b11e5c8e6af4df69f2acdc67ac827834
parent 71e4553c
...@@ -1308,7 +1308,7 @@ void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, ...@@ -1308,7 +1308,7 @@ void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
} }
} }
static void iadst16_8col(__m128i *in) { void iadst16_8col(__m128i *in) {
// perform 16x16 1-D ADST for 8 columns // perform 16x16 1-D ADST for 8 columns
__m128i s[16], x[16], u[32], v[32]; __m128i s[16], x[16], u[32], v[32];
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
...@@ -1778,7 +1778,7 @@ static void iadst16_8col(__m128i *in) { ...@@ -1778,7 +1778,7 @@ static void iadst16_8col(__m128i *in) {
in[15] = _mm_sub_epi16(kZero, s[1]); in[15] = _mm_sub_epi16(kZero, s[1]);
} }
static void idct16_8col(__m128i *in) { void idct16_8col(__m128i *in) {
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
......
...@@ -187,6 +187,8 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { ...@@ -187,6 +187,8 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
RECON_AND_STORE(dest + 15 * stride, in[15]); RECON_AND_STORE(dest + 15 * stride, in[15]);
} }
void iadst16_8col(__m128i *in);
void idct16_8col(__m128i *in);
void idct4_sse2(__m128i *in); void idct4_sse2(__m128i *in);
void idct8_sse2(__m128i *in); void idct8_sse2(__m128i *in);
void idct16_sse2(__m128i *in0, __m128i *in1); void idct16_sse2(__m128i *in0, __m128i *in1);
......
...@@ -73,6 +73,14 @@ static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { ...@@ -73,6 +73,14 @@ static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
return _mm_srli_epi32(v_tmp_d, bits); return _mm_srli_epi32(v_tmp_d, bits);
} }
// Per-lane equivalent of ROUND_POWER_OF_TWO(v_val_d, bits): adds half of
// 2^bits to every 32-bit lane and then shifts right arithmetically by 'bits'.
// Despite the "_unsigned" suffix the shift is arithmetic (_mm_srai_epi32), so
// the sign is preserved; no sign-dependent bias is applied, which means exact
// halves always round towards +infinity.
static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
  const int half = (1 << bits) >> 1;
  return _mm_srai_epi32(_mm_add_epi32(v_val_d, _mm_set1_epi32(half)), bits);
}
// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
......
...@@ -60,23 +60,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { ...@@ -60,23 +60,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x4_16_add/; specialize qw/av1_iht4x4_16_add/;
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; if (aom_config("CONFIG_EXT_TX") eq "yes") {
specialize qw/av1_iht4x8_32_add/; add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x8_32_add/;
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x4_32_add/; specialize qw/av1_iht8x4_32_add/;
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x16_128_add/; specialize qw/av1_iht8x16_128_add/;
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x8_128_add/; specialize qw/av1_iht16x8_128_add/;
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add/; specialize qw/av1_iht16x32_512_add/;
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add/; specialize qw/av1_iht32x16_512_add/;
}
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x8_64_add/; specialize qw/av1_iht8x8_64_add/;
...@@ -87,23 +89,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { ...@@ -87,23 +89,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x4_16_add sse2/; specialize qw/av1_iht4x4_16_add sse2/;
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; if (aom_config("CONFIG_EXT_TX") eq "yes") {
specialize qw/av1_iht4x8_32_add/; add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x8_32_add/;
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x4_32_add/; specialize qw/av1_iht8x4_32_add/;
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x16_128_add/; specialize qw/av1_iht8x16_128_add sse2/;
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x8_128_add/; specialize qw/av1_iht16x8_128_add sse2/;
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add/; specialize qw/av1_iht16x32_512_add/;
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add/; specialize qw/av1_iht32x16_512_add/;
}
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x8_64_add sse2/; specialize qw/av1_iht8x8_64_add sse2/;
...@@ -117,23 +121,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { ...@@ -117,23 +121,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x4_16_add/; specialize qw/av1_iht4x4_16_add/;
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; if (aom_config("CONFIG_EXT_TX") eq "yes") {
specialize qw/av1_iht4x8_32_add/; add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x8_32_add/;
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x4_32_add/; specialize qw/av1_iht8x4_32_add/;
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x16_128_add/; specialize qw/av1_iht8x16_128_add/;
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x8_128_add/; specialize qw/av1_iht16x8_128_add/;
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add/; specialize qw/av1_iht16x32_512_add/;
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add/; specialize qw/av1_iht32x16_512_add/;
}
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x8_64_add/; specialize qw/av1_iht8x8_64_add/;
...@@ -144,23 +150,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { ...@@ -144,23 +150,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x4_16_add sse2 neon dspr2/; specialize qw/av1_iht4x4_16_add sse2 neon dspr2/;
add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; if (aom_config("CONFIG_EXT_TX") eq "yes") {
specialize qw/av1_iht4x8_32_add/; add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x8_32_add/;
add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x4_32_add/; specialize qw/av1_iht8x4_32_add/;
add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x16_128_add/; specialize qw/av1_iht8x16_128_add sse2/;
add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x8_128_add/; specialize qw/av1_iht16x8_128_add sse2/;
add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add/; specialize qw/av1_iht16x32_512_add/;
add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add/; specialize qw/av1_iht32x16_512_add/;
}
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x8_64_add sse2 neon dspr2/; specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;
...@@ -274,23 +282,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { ...@@ -274,23 +282,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; add_proto qw/void av1_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht4x4_16_add/; specialize qw/av1_highbd_iht4x4_16_add/;
add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; if (aom_config("CONFIG_EXT_TX") eq "yes") {
specialize qw/av1_highbd_iht4x8_32_add/; add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht4x8_32_add/;
add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht8x4_32_add/; specialize qw/av1_highbd_iht8x4_32_add/;
add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht8x16_128_add/; specialize qw/av1_highbd_iht8x16_128_add/;
add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht16x8_128_add/; specialize qw/av1_highbd_iht16x8_128_add/;
add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht16x32_512_add/; specialize qw/av1_highbd_iht16x32_512_add/;
add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht32x16_512_add/; specialize qw/av1_highbd_iht32x16_512_add/;
}
add_proto qw/void av1_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; add_proto qw/void av1_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht8x8_64_add/; specialize qw/av1_highbd_iht8x8_64_add/;
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include "./av1_rtcd.h" #include "./av1_rtcd.h"
#include "aom_dsp/x86/inv_txfm_sse2.h" #include "aom_dsp/x86/inv_txfm_sse2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/txfm_common_sse2.h" #include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h" #include "aom_ports/mem.h"
#include "av1/common/enums.h" #include "av1/common/enums.h"
...@@ -303,3 +304,535 @@ void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, ...@@ -303,3 +304,535 @@ void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
dest += 8; dest += 8;
write_buffer_8x16(dest, in1, stride); write_buffer_8x16(dest, in1, stride);
} }
#if CONFIG_EXT_TX
static void iidtx16_8col(__m128i *in) {
const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
__m128i u0, u1, u2, u3, u4, u5, u6, u7;
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
__m128i y0, y1, y2, y3, y4, y5, y6, y7;
in[0] = _mm_slli_epi16(in[0], 1);
in[1] = _mm_slli_epi16(in[1], 1);
in[2] = _mm_slli_epi16(in[2], 1);
in[3] = _mm_slli_epi16(in[3], 1);
in[4] = _mm_slli_epi16(in[4], 1);
in[5] = _mm_slli_epi16(in[5], 1);
in[6] = _mm_slli_epi16(in[6], 1);
in[7] = _mm_slli_epi16(in[7], 1);
in[8] = _mm_slli_epi16(in[8], 1);
in[9] = _mm_slli_epi16(in[9], 1);
in[10] = _mm_slli_epi16(in[10], 1);
in[11] = _mm_slli_epi16(in[11], 1);
in[12] = _mm_slli_epi16(in[12], 1);
in[13] = _mm_slli_epi16(in[13], 1);
in[14] = _mm_slli_epi16(in[14], 1);
in[15] = _mm_slli_epi16(in[15], 1);
v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
in[0] = _mm_packs_epi32(v0, x0);
in[1] = _mm_packs_epi32(v1, x1);
in[2] = _mm_packs_epi32(v2, x2);
in[3] = _mm_packs_epi32(v3, x3);
in[4] = _mm_packs_epi32(v4, x4);
in[5] = _mm_packs_epi32(v5, x5);
in[6] = _mm_packs_epi32(v6, x6);
in[7] = _mm_packs_epi32(v7, x7);
in[8] = _mm_packs_epi32(u0, y0);
in[9] = _mm_packs_epi32(u1, y1);
in[10] = _mm_packs_epi32(u2, y2);
in[11] = _mm_packs_epi32(u3, y3);
in[12] = _mm_packs_epi32(u4, y4);
in[13] = _mm_packs_epi32(u5, y5);
in[14] = _mm_packs_epi32(u6, y6);
in[15] = _mm_packs_epi32(u7, y7);
}
// Doubles, in place, every 16-bit lane of the eight rows in[0..7] by shifting
// left one bit (the function name suggests this is the 8-point inverse
// identity transform). The shift is done in 16-bit lanes, so values wrap
// rather than saturate.
static void iidtx8_sse2(__m128i *in) {
  int row;
  for (row = 0; row < 8; ++row) {
    in[row] = _mm_slli_epi16(in[row], 1);
  }
}
// Applies mm_reverse_epi16 to each of the eight rows in[0..7] in place.
// NOTE(review): mm_reverse_epi16 is defined elsewhere; presumably it reverses
// the order of the eight 16-bit lanes, which would mirror the 8x8 block
// left-to-right as the function name indicates - confirm against the helper.
static INLINE void flip_buffer_lr_8x8(__m128i *in) {
  int row;
  for (row = 0; row < 8; ++row) {
    in[row] = mm_reverse_epi16(in[row]);
  }
}
// Scales, in place, every 16-bit element of the eight rows in[0..7]:
// implements 'ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS)' for each
// element (add half, then arithmetic shift, via xx_roundn_epi32_unsigned),
// followed by a saturating narrow back to 16 bits.
static INLINE void scale_sqrt2_8x8(__m128i *in) {
  const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
  int row;

  // Rows do not interact, so each one is widened, rounded and repacked alone.
  for (row = 0; row < 8; ++row) {
    // Low and high 16-bit halves of the signed 16x16 -> 32-bit products.
    const __m128i v_lo_w = _mm_mullo_epi16(in[row], v_scale_w);
    const __m128i v_hi_w = _mm_mulhi_epi16(in[row], v_scale_w);

    // Interleaving (low, high) halves rebuilds the full 32-bit products:
    // elements 0..3 in 'a', elements 4..7 in 'b'.
    const __m128i v_a_d = _mm_unpacklo_epi16(v_lo_w, v_hi_w);
    const __m128i v_b_d = _mm_unpackhi_epi16(v_lo_w, v_hi_w);

    in[row] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_a_d, DCT_CONST_BITS),
                              xx_roundn_epi32_unsigned(v_b_d, DCT_CONST_BITS));
  }
}
void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride, int tx_type) {
__m128i in[16];
in[0] = load_input_data(input + 0 * 8);
in[1] = load_input_data(input + 1 * 8);
in[2] = load_input_data(input + 2 * 8);
in[3] = load_input_data(input + 3 * 8);
in[4] = load_input_data(input + 4 * 8);
in[5] = load_input_data(input + 5 * 8);
in[6] = load_input_data(input + 6 * 8);
in[7] = load_input_data(input + 7 * 8);
in[8] = load_input_data(input + 8 * 8);