Commit 1a800f65, authored by Geza Lore, committed by Debargha Mukherjee
Browse files

Add SSE2 versions of av1_fht8x16 and av1_fht16x8

Encoder speedup ~2% with ext-tx + rect-tx

Change-Id: Id56ddf102a887de31d181bde6d8ef8c4f03da945
parent e51ee021
...@@ -68,13 +68,13 @@ static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { ...@@ -68,13 +68,13 @@ static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
} }
static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1)); const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
return _mm_srli_epi32(v_tmp_d, bits); return _mm_srli_epi32(v_tmp_d, bits);
} }
static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1)); const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
const __m128i v_tmp_d = const __m128i v_tmp_d =
_mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
......
...@@ -51,7 +51,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { ...@@ -51,7 +51,7 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
} }
# #
# dct # Inverse dct
# #
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure # Note as optimized versions of these functions are added we need to add a check to ensure
...@@ -368,10 +368,22 @@ if (aom_config("CONFIG_AOM_QM") eq "yes") { ...@@ -368,10 +368,22 @@ if (aom_config("CONFIG_AOM_QM") eq "yes") {
# fdct functions # fdct functions
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/av1_fht4x4 sse2/;
specialize qw/av1_fht4x4 sse2/;
add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fwht4x4/;
add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht8x8 sse2/;
add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht16x16 sse2/;
add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x32/;
if (aom_config("CONFIG_EXT_TX") eq "yes") {
add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht4x8/; specialize qw/av1_fht4x8/;
...@@ -379,56 +391,84 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { ...@@ -379,56 +391,84 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize qw/av1_fht8x4/; specialize qw/av1_fht8x4/;
add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht8x16/; specialize qw/av1_fht8x16 sse2/;
add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht16x8/; specialize qw/av1_fht16x8 sse2/;
add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht16x32/; specialize qw/av1_fht16x32/;
add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x16/; specialize qw/av1_fht32x16/;
}
add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
specialize qw/av1_fht8x8 sse2/; add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4/;
add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fht16x16 sse2/; specialize qw/av1_fdct4x4_1/;
add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fht32x32/; specialize qw/av1_fdct8x8/;
add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fwht4x4/; specialize qw/av1_fdct8x8_1/;
if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4/; specialize qw/av1_fdct16x16/;
add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct16x16_1/;
add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32/;
add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4_1/; specialize qw/av1_fdct32x32_rd/;
add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct8x8/; specialize qw/av1_fdct32x32_1/;
} else {
add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4 sse2/;
add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4_1 sse2/;
add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct8x8_1/; specialize qw/av1_fdct8x8 sse2/;
add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct16x16/; specialize qw/av1_fdct8x8_1 sse2/;
add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct16x16_1/; specialize qw/av1_fdct16x16 sse2/;
add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32/; specialize qw/av1_fdct16x16_1 sse2/;
add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32_rd/; specialize qw/av1_fdct32x32 sse2/;
add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32_1/; specialize qw/av1_fdct32x32_rd sse2/;
add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32_1 sse2/;
}
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") {
if (aom_config("CONFIG_EXT_TX") ne "yes") {
specialize qw/av1_fht4x4 msa/;
specialize qw/av1_fht8x8 msa/;
specialize qw/av1_fht16x16 msa/;
}
}
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_highbd_fdct4x4/; specialize qw/av1_highbd_fdct4x4/;
...@@ -453,33 +493,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { ...@@ -453,33 +493,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_highbd_fdct32x32_1/; specialize qw/av1_highbd_fdct32x32_1/;
} else { } else {
add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4 sse2/;
add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4_1 sse2/;
add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct8x8 sse2/;
add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct8x8_1 sse2/;
add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct16x16 sse2/;
add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct16x16_1 sse2/;
add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32 sse2/;
add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32_rd sse2/;
add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32_1 sse2/;
add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_highbd_fdct4x4 sse2/; specialize qw/av1_highbd_fdct4x4 sse2/;
...@@ -504,100 +517,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { ...@@ -504,100 +517,6 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_highbd_fdct32x32_1/; specialize qw/av1_highbd_fdct32x32_1/;
} }
} else {
add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht4x4 sse2/;
add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht4x8/;
add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht8x4/;
add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht8x16/;
add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht16x8/;
add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht16x32/;
add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x16/;
add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht8x8 sse2/;
add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht16x16 sse2/;
if (aom_config("CONFIG_EXT_TX") ne "yes") {
specialize qw/av1_fht4x4 msa/;
specialize qw/av1_fht8x8 msa/;
specialize qw/av1_fht16x16 msa/;
}
add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x32/;
add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fwht4x4/;
if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4/;
add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4_1/;
add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct8x8/;
add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct8x8_1/;
add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct16x16/;
add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct16x16_1/;
add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32/;
add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32_rd/;
add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32_1/;
} else {
add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4 sse2/;
add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct4x4_1 sse2/;
add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct8x8 sse2/;
add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct8x8_1 sse2/;
add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct16x16 sse2/;
add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct16x16_1 sse2/;
add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32 sse2/;
add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32_rd sse2/;
add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_fdct32x32_1 sse2/;
}
} }
add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type"; add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
......
...@@ -1311,8 +1311,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, ...@@ -1311,8 +1311,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
// Columns // Columns
for (i = 0; i < n; ++i) { for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) for (j = 0; j < n2; ++j)
temp_in[j] = temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2,
(tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); DCT_CONST_BITS);
ht.cols(temp_in, temp_out); ht.cols(temp_in, temp_out);
for (j = 0; j < n2; ++j) out[j * n + i] = temp_out[j]; for (j = 0; j < n2; ++j) out[j * n + i] = temp_out[j];
} }
...@@ -1321,7 +1321,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride, ...@@ -1321,7 +1321,8 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
for (i = 0; i < n2; ++i) { for (i = 0; i < n2; ++i) {
for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n]; for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
ht.rows(temp_in, temp_out); ht.rows(temp_in, temp_out);
for (j = 0; j < n; ++j) output[j + i * n] = (temp_out[j] + 1) >> 2; for (j = 0; j < n; ++j)
output[j + i * n] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
} }
// Note: overall scale factor of transform is 8 times unitary // Note: overall scale factor of transform is 8 times unitary
} }
...@@ -1358,8 +1359,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, ...@@ -1358,8 +1359,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
// Columns // Columns
for (i = 0; i < n2; ++i) { for (i = 0; i < n2; ++i) {
for (j = 0; j < n; ++j) for (j = 0; j < n; ++j)
temp_in[j] = temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2,
(tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2); DCT_CONST_BITS);
ht.cols(temp_in, temp_out); ht.cols(temp_in, temp_out);
for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j]; for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
} }
...@@ -1368,7 +1369,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride, ...@@ -1368,7 +1369,8 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
for (i = 0; i < n; ++i) { for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.rows(temp_in, temp_out); ht.rows(temp_in, temp_out);
for (j = 0; j < n2; ++j) output[j + i * n2] = (temp_out[j] + 1) >> 2; for (j = 0; j < n2; ++j)
output[j + i * n2] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
} }
// Note: overall scale factor of transform is 8 times unitary // Note: overall scale factor of transform is 8 times unitary
} }
......
...@@ -12,10 +12,11 @@ ...@@ -12,10 +12,11 @@
#include <assert.h> #include <assert.h>
#include <emmintrin.h> // SSE2 #include <emmintrin.h> // SSE2
#include "./av1_rtcd.h"
#include "./aom_dsp_rtcd.h" #include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom_dsp/txfm_common.h" #include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/fwd_txfm_sse2.h" #include "aom_dsp/x86/fwd_txfm_sse2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/txfm_common_sse2.h" #include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h" #include "aom_ports/mem.h"
...@@ -2584,3 +2585,362 @@ void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, ...@@ -2584,3 +2585,362 @@ void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
default: assert(0); break; default: assert(0); break;
} }
} }
#if CONFIG_EXT_TX
static INLINE void scale_sqrt2_8x8(__m128i *in) {
// Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
// for each element
const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);
const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
in[0] = _mm_packs_epi32(xx_roundn_epi32(v_p0a_d, DCT_CONST_BITS),
xx_roundn_epi32(v_p0b_d, DCT_CONST_BITS));
in[1] = _mm_packs_epi32(xx_roundn_epi32(v_p1a_d, DCT_CONST_BITS),
xx_roundn_epi32(v_p1b_d, DCT_CONST_BITS));
in[2] = _mm_packs_epi32(xx_roundn_epi32(v_p2a_d, DCT_CONST_BITS),
xx_roundn_epi32(v_p2b_d, DCT_CONST_BITS));
in[3] = _mm_packs_epi32(xx_roundn_epi32(v_p3a_d, DCT_CONST_BITS),
xx_roundn_epi32(v_p3b_d, DCT_CONST_BITS));
in[4] = _mm_packs_epi32(xx_roundn_epi32(v_p4a_d, DCT_CONST_BITS),
xx_roundn_epi32(v_p4b_d, DCT_CONST_BITS));
in[5] = _mm_packs_epi32(xx_roundn_epi32(v_p5a_d, DCT_CONST_BITS),
xx_roundn_epi32(v_p5b_d, DCT_CONST_BITS));
in[6] = _mm_packs_epi32(xx_roundn_epi32(v_p6a_d, DCT_CONST_BITS),
xx_roundn_epi32(v_p6b_d, DCT_CONST_BITS));
in[7] = _mm_packs_epi32(xx_roundn_epi32(v_p7a_d, DCT_CONST_BITS),
xx_roundn_epi32(v_p7b_d, DCT_CONST_BITS));
}
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
                                    int stride, int flipud, int fliplr) {
  // Loads two 8x8 blocks into in[0..7] and in[8..15], applying
  // scale_sqrt2_8x8() to each half right after it is loaded.
  // The blocks start at `input` and at `input + 8 * stride`; when flipud is
  // non-zero their order is exchanged, so the second block lands in in[0..7].
  // flipud and fliplr are also forwarded unchanged to load_buffer_8x8().
  const int16_t *const upper = input;
  const int16_t *const lower = input + 8 * stride;
  const int16_t *const first = flipud ? lower : upper;
  const int16_t *const second = flipud ? upper : lower;

  load_buffer_8x8(first, in, stride, flipud, fliplr);
  scale_sqrt2_8x8(in);

  load_buffer_8x8(second, in + 8, stride, flipud, fliplr);
  scale_sqrt2_8x8(in + 8);
}
void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
__m128i in[16];
__m128i *const t = in; // Alias to top 8x8 sub block
__m128i *const b = in + 8; // Alias to bottom 8x8 sub block
switch (tx_type) {
case DCT_DCT:
load_buffer_8x16(input, in, stride, 0, 0);
fdct16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fdct8_sse2(t);
fdct8_sse2(b);
break;
case ADST_DCT:
load_buffer_8x16(input, in, stride, 0, 0);
fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fdct8_sse2(t);
fdct8_sse2(b);
break;
case DCT_ADST:
load_buffer_8x16(input, in, stride, 0, 0);
fdct16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
break;
case ADST_ADST:
load_buffer_8x16(input, in, stride, 0, 0);
fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
load_buffer_8x16(input, in, stride, 1, 0);
fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fdct8_sse2(t);
fdct8_sse2(b);
break;
case DCT_FLIPADST:
load_buffer_8x16(input, in, stride, 0, 1);
fdct16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
break;
case FLIPADST_FLIPADST:
load_buffer_8x16(input, in, stride, 1, 1);
fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
break;
case ADST_FLIPADST:
load_buffer_8x16(input, in, stride, 0, 1);
fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
break;
case FLIPADST_ADST:
load_buffer_8x16(input, in, stride, 1, 0);
fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
break;
case IDTX:
load_buffer_8x16(input, in, stride, 0, 0);
fidtx16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fidtx8_sse2(t);
fidtx8_sse2(b);
break;
case V_DCT:
load_buffer_8x16(input, in, stride, 0, 0);
fdct16_8col(in);