Commit d8b26caa authored by Jingning Han's avatar Jingning Han Committed by Gerrit Code Review

Merge "Adjust the forward 16x16 DCT computation steps"

parents e39860be 7f547336
......@@ -512,7 +512,9 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
#if HAVE_NEON_ASM
// FIXME (jingning, fgalligan): need to simplify the corresponding steps
// in neov version accordingly, and re-enable the unit test
#if HAVE_NEON_ASM && 0
INSTANTIATE_TEST_CASE_P(
NEON, Trans16x16DCT,
::testing::Values(
......
......@@ -445,20 +445,20 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
step3[7] = step1[7] + step2[4];
// step 4
temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
step2[1] = fdct_round_shift(temp1);
step2[2] = fdct_round_shift(temp2);
temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
step2[5] = fdct_round_shift(temp1);
step2[6] = fdct_round_shift(temp2);
// step 5
step1[0] = step3[0] + step2[1];
step1[1] = step3[0] - step2[1];
step1[2] = step3[3] - step2[2];
step1[3] = step3[3] + step2[2];
step1[4] = step3[4] + step2[5];
step1[5] = step3[4] - step2[5];
step1[2] = step3[3] + step2[2];
step1[3] = step3[3] - step2[2];
step1[4] = step3[4] - step2[5];
step1[5] = step3[4] + step2[5];
step1[6] = step3[7] - step2[6];
step1[7] = step3[7] + step2[6];
// step 6
......@@ -755,10 +755,10 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
// step 4
temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
step2[1] = fdct_round_shift(temp1);
step2[2] = fdct_round_shift(temp2);
temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
step2[5] = fdct_round_shift(temp1);
step2[6] = fdct_round_shift(temp2);
......@@ -766,10 +766,10 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
// step 5
step1[0] = step3[0] + step2[1];
step1[1] = step3[0] - step2[1];
step1[2] = step3[3] - step2[2];
step1[3] = step3[3] + step2[2];
step1[4] = step3[4] + step2[5];
step1[5] = step3[4] - step2[5];
step1[2] = step3[3] + step2[2];
step1[3] = step3[3] - step2[2];
step1[4] = step3[4] - step2[5];
step1[5] = step3[4] + step2[5];
step1[6] = step3[7] - step2[6];
step1[7] = step3[7] + step2[6];
......
......@@ -1187,7 +1187,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
......@@ -1513,8 +1513,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
// dct_const_round_shift
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
......@@ -1535,8 +1535,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
// dct_const_round_shift
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
......@@ -1554,10 +1554,10 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
{
step1_0 = _mm_add_epi16(step3_0, step2_1);
step1_1 = _mm_sub_epi16(step3_0, step2_1);
step1_2 = _mm_sub_epi16(step3_3, step2_2);
step1_3 = _mm_add_epi16(step3_3, step2_2);
step1_4 = _mm_add_epi16(step3_4, step2_5);
step1_5 = _mm_sub_epi16(step3_4, step2_5);
step1_2 = _mm_add_epi16(step3_3, step2_2);
step1_3 = _mm_sub_epi16(step3_3, step2_2);
step1_4 = _mm_sub_epi16(step3_4, step2_5);
step1_5 = _mm_add_epi16(step3_4, step2_5);
step1_6 = _mm_sub_epi16(step3_7, step2_6);
step1_7 = _mm_add_epi16(step3_7, step2_6);
}
......@@ -1848,7 +1848,7 @@ void fdct16_8col(__m128i *in) {
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
......@@ -2052,10 +2052,10 @@ void fdct16_8col(__m128i *in) {
v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
......@@ -2085,10 +2085,10 @@ void fdct16_8col(__m128i *in) {
// stage 5
s[0] = _mm_add_epi16(p[0], t[1]);
s[1] = _mm_sub_epi16(p[0], t[1]);
s[2] = _mm_sub_epi16(p[3], t[2]);
s[3] = _mm_add_epi16(p[3], t[2]);
s[4] = _mm_add_epi16(p[4], t[5]);
s[5] = _mm_sub_epi16(p[4], t[5]);
s[2] = _mm_add_epi16(p[3], t[2]);
s[3] = _mm_sub_epi16(p[3], t[2]);
s[4] = _mm_sub_epi16(p[4], t[5]);
s[5] = _mm_add_epi16(p[4], t[5]);
s[6] = _mm_sub_epi16(p[7], t[6]);
s[7] = _mm_add_epi16(p[7], t[6]);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment