Commit 770bf715 authored by Yi Luo

8x8/16x16 HT types V_DCT to H_FLIPADST SSE2 optimization

- Added functions fidtx8_sse2() and fidtx16_sse2().
- Turned on vp10_fht8x8_sse2()/vp10_fht16x16_sse2() for the new types.
- Updated the 8x8/16x16 unit tests for accuracy/speed.
- Running 20K iterations on random input, cycling through tx types
  V_DCT to H_FLIPADST, the SSE2 speed improvement is:
  8x8: ~131%
  16x16: ~66%

Change-Id: Ibbb707e932a08fec3b1f423a7dab280a1d696c9a
parent f9d77d66
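
For reference, the two identity transforms added by this commit reduce to simple per-sample scalings. The scalar sketch below is not part of the commit; it mirrors what each intrinsic sequence computes per 1-D pass (the transposes performed in the SSE2 code are omitted) and assumes ROUND_POWER_OF_TWO/DCT_CONST_BITS from the vpx_dsp headers plus the fixed-point Sqrt2 constant used by the vp10 transforms (header location assumed):

#include <stdint.h>
#include "vpx_dsp/txfm_common.h"     // DCT_CONST_BITS
#include "vpx_dsp/vpx_dsp_common.h"  // ROUND_POWER_OF_TWO

// 8x8 identity pass (what fidtx8_sse2 computes): every sample is doubled.
static void fidtx8_ref(const int16_t *in, int16_t *out) {
  int i;
  for (i = 0; i < 8; ++i) out[i] = in[i] * 2;
}

// 16x16 identity pass (what fidtx16_8col computes): double, multiply by the
// fixed-point Sqrt2, add DCT_CONST_ROUNDING and shift by DCT_CONST_BITS,
// i.e. the unpack/madd/round/pack sequence done scalar-wise.
static void fidtx16_ref(const int16_t *in, int16_t *out) {
  int i;
  for (i = 0; i < 16; ++i)
    out[i] = (int16_t)ROUND_POWER_OF_TWO(in[i] * 2 * Sqrt2, DCT_CONST_BITS);
}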
@@ -70,6 +70,61 @@ TEST_P(VP10Trans16x16HT, CoeffCheck) {
RunCoeffCheck();
}
#if CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
TEST(VP10Trans16x16HTSpeedTest, C_version) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 20000;
int bit_depth = 8;
int mask = (1 << bit_depth) - 1;
const int num_coeffs = 256;
int16_t *input = new int16_t[num_coeffs];
tran_low_t *output = new tran_low_t[num_coeffs];
const int stride = 16;
int tx_type;
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < num_coeffs; ++j) {
input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
}
for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
vp10_fht16x16_c(input, output, stride, tx_type);
}
}
delete[] input;
delete[] output;
}
#endif // CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
#if HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
TEST(VP10Trans16x16HTSpeedTest, SSE2_version) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 20000;
int bit_depth = 8;
int mask = (1 << bit_depth) - 1;
const int num_coeffs = 256;
int16_t *input = reinterpret_cast<int16_t *>
(vpx_memalign(16, sizeof(int16_t) * num_coeffs));
tran_low_t *output = reinterpret_cast<tran_low_t *>
(vpx_memalign(16, sizeof(tran_low_t) * num_coeffs));
const int stride = 16;
int tx_type;
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < num_coeffs; ++j) {
input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
}
for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
vp10_fht16x16_sse2(input, output, stride, tx_type);
}
}
vpx_free(input);
vpx_free(output);
}
#endif // HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
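
Neither speed test above times itself; the ~66% (16x16) and ~131% (8x8) figures in the commit message are presumably measured externally. One hypothetical way to time a 20K-iteration run with libvpx's microsecond timer (not part of the commit; TimeFht16x16 is an illustrative name, and the other types/headers are assumed to come from the test file's existing includes) is:

#include "vpx_ports/vpx_timer.h"

// Returns the elapsed microseconds for 20000 iterations over tx types
// V_DCT..H_FLIPADST; call once with vp10_fht16x16_c and once with
// vp10_fht16x16_sse2, then compare the two results.
int64_t TimeFht16x16(void (*fht)(const int16_t *, tran_low_t *, int, int),
                     const int16_t *input, tran_low_t *output) {
  struct vpx_usec_timer timer;
  int i, tx_type;
  vpx_usec_timer_start(&timer);
  for (i = 0; i < 20000; ++i) {
    for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type)
      fht(input, output, 16, tx_type);
  }
  vpx_usec_timer_mark(&timer);
  return vpx_usec_timer_elapsed(&timer);
}
// speedup_percent = 100.0 * (time_c - time_sse2) / time_sse2;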
using std::tr1::make_tuple;
#if HAVE_SSE2
@@ -103,6 +158,18 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 7,
VPX_BITS_8, 256),
make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 8,
VPX_BITS_8, 256),
make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 10,
VPX_BITS_8, 256),
make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 11,
VPX_BITS_8, 256),
make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 12,
VPX_BITS_8, 256),
make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 13,
VPX_BITS_8, 256),
make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 14,
VPX_BITS_8, 256),
make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 15,
VPX_BITS_8, 256)));
#endif // !CONFIG_EXT_TX
#endif // HAVE_SSE2
......
@@ -69,6 +69,61 @@ TEST_P(VP10Trans8x8HT, CoeffCheck) {
RunCoeffCheck();
}
#if CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
TEST(VP10Trans8x8HTSpeedTest, C_version) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 20000;
int bit_depth = 8;
int mask = (1 << bit_depth) - 1;
const int num_coeffs = 64;
int16_t *input = new int16_t[num_coeffs];
tran_low_t *output = new tran_low_t[num_coeffs];
const int stride = 8;
int tx_type;
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < num_coeffs; ++j) {
input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
}
for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
vp10_fht8x8_c(input, output, stride, tx_type);
}
}
delete[] input;
delete[] output;
}
#endif // CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
#if HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
TEST(VP10Trans8x8HTSpeedTest, SSE2_version) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 20000;
int bit_depth = 8;
int mask = (1 << bit_depth) - 1;
const int num_coeffs = 64;
int16_t *input = reinterpret_cast<int16_t *>
(vpx_memalign(16, sizeof(int16_t) * num_coeffs));
tran_low_t *output = reinterpret_cast<tran_low_t *>
(vpx_memalign(16, sizeof(tran_low_t) * num_coeffs));
const int stride = 8;
int tx_type;
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < num_coeffs; ++j) {
input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
}
for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
vp10_fht8x8_sse2(input, output, stride, tx_type);
}
}
vpx_free(input);
vpx_free(output);
}
#endif // HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
using std::tr1::make_tuple;
#if HAVE_SSE2
@@ -102,6 +157,18 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 7,
VPX_BITS_8, 64),
make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 8,
VPX_BITS_8, 64),
make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 10,
VPX_BITS_8, 64),
make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 11,
VPX_BITS_8, 64),
make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 12,
VPX_BITS_8, 64),
make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 13,
VPX_BITS_8, 64),
make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 14,
VPX_BITS_8, 64),
make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 15,
VPX_BITS_8, 64)));
#endif // !CONFIG_EXT_TX
#endif // HAVE_SSE2
......
@@ -54,8 +54,6 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
break;
case V_DCT:
case H_DCT:
case V_ADST:
@@ -70,7 +68,6 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
}
}
@@ -93,15 +90,13 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
break;
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
case H_FLIPADST:
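// These 1-D hybrid types now go through vp10_fht8x8() (the RTCD-dispatched
// hook) instead of vp10_fht8x8_c(), so the new SSE2 path is used when
// available.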
vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
break;
case IDTX:
vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
@@ -109,7 +104,6 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
}
}
@@ -132,15 +126,13 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
break;
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
case H_FLIPADST:
vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
break;
case IDTX:
vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
@@ -148,7 +140,6 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
}
}
......
@@ -1280,6 +1280,21 @@ static void fadst8_sse2(__m128i *in) {
array_transpose_8x8(in, in);
}
#if CONFIG_EXT_TX
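// 1-D identity transform for an 8x8 block: every 16-bit lane is doubled,
// then the block is transposed (as fdct8_sse2/fadst8_sse2 do) so the second
// pass can operate on the other dimension.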
static void fidtx8_sse2(__m128i *in) {
in[0] = _mm_slli_epi16(in[0], 1);
in[1] = _mm_slli_epi16(in[1], 1);
in[2] = _mm_slli_epi16(in[2], 1);
in[3] = _mm_slli_epi16(in[3], 1);
in[4] = _mm_slli_epi16(in[4], 1);
in[5] = _mm_slli_epi16(in[5], 1);
in[6] = _mm_slli_epi16(in[6], 1);
in[7] = _mm_slli_epi16(in[7], 1);
array_transpose_8x8(in, in);
}
#endif // CONFIG_EXT_TX
void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
__m128i in[8];
@@ -1345,10 +1360,51 @@ void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
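// New 1-D hybrid types: the V_* cases run the real transform in the first
// pass and the identity in the second; the H_* cases reverse the order.
// The FLIPADST variants handle the flip through the load_buffer_8x8() flip
// arguments rather than a separate step.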
case V_DCT:
load_buffer_8x8(input, in, stride, 0, 0);
fdct8_sse2(in);
fidtx8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case H_DCT:
load_buffer_8x8(input, in, stride, 0, 0);
fidtx8_sse2(in);
fdct8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case V_ADST:
load_buffer_8x8(input, in, stride, 0, 0);
fadst8_sse2(in);
fidtx8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case H_ADST:
load_buffer_8x8(input, in, stride, 0, 0);
fidtx8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case V_FLIPADST:
load_buffer_8x8(input, in, stride, 1, 0);
fadst8_sse2(in);
fidtx8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case H_FLIPADST:
load_buffer_8x8(input, in, stride, 0, 1);
fidtx8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
break;
}
}
@@ -2226,6 +2282,204 @@ static void fadst16_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
}
#if CONFIG_EXT_TX
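// Identity pass over 8 columns of a 16x16 block: each sample is doubled,
// widened to 32 bits (unpack against zero + madd with Sqrt2), rounded with
// DCT_CONST_ROUNDING, shifted by DCT_CONST_BITS, and packed back to 16 bits.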
static void fidtx16_8col(__m128i *in) {
const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
__m128i u0, u1, u2, u3, u4, u5, u6, u7;
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
__m128i y0, y1, y2, y3, y4, y5, y6, y7;
in[0] = _mm_slli_epi16(in[0], 1);
in[1] = _mm_slli_epi16(in[1], 1);
in[2] = _mm_slli_epi16(in[2], 1);
in[3] = _mm_slli_epi16(in[3], 1);
in[4] = _mm_slli_epi16(in[4], 1);
in[5] = _mm_slli_epi16(in[5], 1);
in[6] = _mm_slli_epi16(in[6], 1);
in[7] = _mm_slli_epi16(in[7], 1);
in[8] = _mm_slli_epi16(in[8], 1);
in[9] = _mm_slli_epi16(in[9], 1);
in[10] = _mm_slli_epi16(in[10], 1);
in[11] = _mm_slli_epi16(in[11], 1);
in[12] = _mm_slli_epi16(in[12], 1);
in[13] = _mm_slli_epi16(in[13], 1);
in[14] = _mm_slli_epi16(in[14], 1);
in[15] = _mm_slli_epi16(in[15], 1);
v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
in[0] = _mm_packs_epi32(v0, x0);
in[1] = _mm_packs_epi32(v1, x1);
in[2] = _mm_packs_epi32(v2, x2);
in[3] = _mm_packs_epi32(v3, x3);
in[4] = _mm_packs_epi32(v4, x4);
in[5] = _mm_packs_epi32(v5, x5);
in[6] = _mm_packs_epi32(v6, x6);
in[7] = _mm_packs_epi32(v7, x7);
in[8] = _mm_packs_epi32(u0, y0);
in[9] = _mm_packs_epi32(u1, y1);
in[10] = _mm_packs_epi32(u2, y2);
in[11] = _mm_packs_epi32(u3, y3);
in[12] = _mm_packs_epi32(u4, y4);
in[13] = _mm_packs_epi32(u5, y5);
in[14] = _mm_packs_epi32(u6, y6);
in[15] = _mm_packs_epi32(u7, y7);
}
static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
fidtx16_8col(in0);
fidtx16_8col(in1);
array_transpose_16x16(in0, in1);
}
#endif // CONFIG_EXT_TX
void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
__m128i in0[16], in1[16];
@@ -2291,6 +2545,48 @@ void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
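// New 1-D hybrid types for 16x16: same first-pass/second-pass split as the
// 8x8 version, with right_shift_16x16() between the two passes as in the
// existing two-pass cases.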
case V_DCT:
load_buffer_16x16(input, in0, in1, stride, 0, 0);
fdct16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fidtx16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case H_DCT:
load_buffer_16x16(input, in0, in1, stride, 0, 0);
fidtx16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fdct16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case V_ADST:
load_buffer_16x16(input, in0, in1, stride, 0, 0);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fidtx16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case H_ADST:
load_buffer_16x16(input, in0, in1, stride, 0, 0);
fidtx16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case V_FLIPADST:
load_buffer_16x16(input, in0, in1, stride, 1, 0);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fidtx16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case H_FLIPADST:
load_buffer_16x16(input, in0, in1, stride, 0, 1);
fidtx16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
#endif // CONFIG_EXT_TX
default:
assert(0);
......