Commit b51e0781 authored by Urvang Joshi
Browse files

av1_fwd_txfm1d_sse4: Use for loops when possible.

BUG=aomedia:442

Change-Id: I80706666b50058c9d0ecd6f568fe0e7a07d00185
parent d6c17d98
......@@ -14,38 +14,10 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
for (col = 0; col < col_num; col++) {
// stage 0;
int32_t stage_idx = 0;
buf0[0] = input[0 * col_num + col];
buf0[1] = input[1 * col_num + col];
buf0[2] = input[2 * col_num + col];
buf0[3] = input[3 * col_num + col];
buf0[4] = input[4 * col_num + col];
buf0[5] = input[5 * col_num + col];
buf0[6] = input[6 * col_num + col];
buf0[7] = input[7 * col_num + col];
buf0[8] = input[8 * col_num + col];
buf0[9] = input[9 * col_num + col];
buf0[10] = input[10 * col_num + col];
buf0[11] = input[11 * col_num + col];
buf0[12] = input[12 * col_num + col];
buf0[13] = input[13 * col_num + col];
buf0[14] = input[14 * col_num + col];
buf0[15] = input[15 * col_num + col];
buf0[16] = input[16 * col_num + col];
buf0[17] = input[17 * col_num + col];
buf0[18] = input[18 * col_num + col];
buf0[19] = input[19 * col_num + col];
buf0[20] = input[20 * col_num + col];
buf0[21] = input[21 * col_num + col];
buf0[22] = input[22 * col_num + col];
buf0[23] = input[23 * col_num + col];
buf0[24] = input[24 * col_num + col];
buf0[25] = input[25 * col_num + col];
buf0[26] = input[26 * col_num + col];
buf0[27] = input[27 * col_num + col];
buf0[28] = input[28 * col_num + col];
buf0[29] = input[29 * col_num + col];
buf0[30] = input[30 * col_num + col];
buf0[31] = input[31 * col_num + col];
int j;
for (j = 0; j < 32; ++j) {
buf0[j] = input[j * col_num + col];
}
// stage 1
stage_idx++;
......@@ -376,38 +348,9 @@ void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
buf1[30] = buf0[15];
buf1[31] = buf0[31];
output[0 * col_num + col] = buf1[0];
output[1 * col_num + col] = buf1[1];
output[2 * col_num + col] = buf1[2];
output[3 * col_num + col] = buf1[3];
output[4 * col_num + col] = buf1[4];
output[5 * col_num + col] = buf1[5];
output[6 * col_num + col] = buf1[6];
output[7 * col_num + col] = buf1[7];
output[8 * col_num + col] = buf1[8];
output[9 * col_num + col] = buf1[9];
output[10 * col_num + col] = buf1[10];
output[11 * col_num + col] = buf1[11];
output[12 * col_num + col] = buf1[12];
output[13 * col_num + col] = buf1[13];
output[14 * col_num + col] = buf1[14];
output[15 * col_num + col] = buf1[15];
output[16 * col_num + col] = buf1[16];
output[17 * col_num + col] = buf1[17];
output[18 * col_num + col] = buf1[18];
output[19 * col_num + col] = buf1[19];
output[20 * col_num + col] = buf1[20];
output[21 * col_num + col] = buf1[21];
output[22 * col_num + col] = buf1[22];
output[23 * col_num + col] = buf1[23];
output[24 * col_num + col] = buf1[24];
output[25 * col_num + col] = buf1[25];
output[26 * col_num + col] = buf1[26];
output[27 * col_num + col] = buf1[27];
output[28 * col_num + col] = buf1[28];
output[29 * col_num + col] = buf1[29];
output[30 * col_num + col] = buf1[30];
output[31 * col_num + col] = buf1[31];
for (j = 0; j < 32; ++j) {
output[j * col_num + col] = buf1[j];
}
}
}
......@@ -425,10 +368,10 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
for (col = 0; col < col_num; col++) {
// stage 0;
int32_t stage_idx = 0;
buf0[0] = input[0 * col_num + col];
buf0[1] = input[1 * col_num + col];
buf0[2] = input[2 * col_num + col];
buf0[3] = input[3 * col_num + col];
int j;
for (j = 0; j < 4; ++j) {
buf0[j] = input[j * col_num + col];
}
// stage 1
stage_idx++;
......@@ -469,10 +412,9 @@ void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
buf1[2] = buf0[3];
buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
output[0 * col_num + col] = buf1[0];
output[1 * col_num + col] = buf1[1];
output[2 * col_num + col] = buf1[2];
output[3 * col_num + col] = buf1[3];
for (j = 0; j < 4; ++j) {
output[j * col_num + col] = buf1[j];
}
}
}
......@@ -490,38 +432,10 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
for (col = 0; col < col_num; col++) {
// stage 0;
int32_t stage_idx = 0;
buf0[0] = input[0 * col_num + col];
buf0[1] = input[1 * col_num + col];
buf0[2] = input[2 * col_num + col];
buf0[3] = input[3 * col_num + col];
buf0[4] = input[4 * col_num + col];
buf0[5] = input[5 * col_num + col];
buf0[6] = input[6 * col_num + col];
buf0[7] = input[7 * col_num + col];
buf0[8] = input[8 * col_num + col];
buf0[9] = input[9 * col_num + col];
buf0[10] = input[10 * col_num + col];
buf0[11] = input[11 * col_num + col];
buf0[12] = input[12 * col_num + col];
buf0[13] = input[13 * col_num + col];
buf0[14] = input[14 * col_num + col];
buf0[15] = input[15 * col_num + col];
buf0[16] = input[16 * col_num + col];
buf0[17] = input[17 * col_num + col];
buf0[18] = input[18 * col_num + col];
buf0[19] = input[19 * col_num + col];
buf0[20] = input[20 * col_num + col];
buf0[21] = input[21 * col_num + col];
buf0[22] = input[22 * col_num + col];
buf0[23] = input[23 * col_num + col];
buf0[24] = input[24 * col_num + col];
buf0[25] = input[25 * col_num + col];
buf0[26] = input[26 * col_num + col];
buf0[27] = input[27 * col_num + col];
buf0[28] = input[28 * col_num + col];
buf0[29] = input[29 * col_num + col];
buf0[30] = input[30 * col_num + col];
buf0[31] = input[31 * col_num + col];
int j;
for (j = 0; j < 32; ++j) {
buf0[j] = input[j * col_num + col];
}
// stage 1
stage_idx++;
......@@ -918,37 +832,8 @@ void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
buf1[30] = buf0[17];
buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
output[0 * col_num + col] = buf1[0];
output[1 * col_num + col] = buf1[1];
output[2 * col_num + col] = buf1[2];
output[3 * col_num + col] = buf1[3];
output[4 * col_num + col] = buf1[4];
output[5 * col_num + col] = buf1[5];
output[6 * col_num + col] = buf1[6];
output[7 * col_num + col] = buf1[7];
output[8 * col_num + col] = buf1[8];
output[9 * col_num + col] = buf1[9];
output[10 * col_num + col] = buf1[10];
output[11 * col_num + col] = buf1[11];
output[12 * col_num + col] = buf1[12];
output[13 * col_num + col] = buf1[13];
output[14 * col_num + col] = buf1[14];
output[15 * col_num + col] = buf1[15];
output[16 * col_num + col] = buf1[16];
output[17 * col_num + col] = buf1[17];
output[18 * col_num + col] = buf1[18];
output[19 * col_num + col] = buf1[19];
output[20 * col_num + col] = buf1[20];
output[21 * col_num + col] = buf1[21];
output[22 * col_num + col] = buf1[22];
output[23 * col_num + col] = buf1[23];
output[24 * col_num + col] = buf1[24];
output[25 * col_num + col] = buf1[25];
output[26 * col_num + col] = buf1[26];
output[27 * col_num + col] = buf1[27];
output[28 * col_num + col] = buf1[28];
output[29 * col_num + col] = buf1[29];
output[30 * col_num + col] = buf1[30];
output[31 * col_num + col] = buf1[31];
for (j = 0; j < 32; ++j) {
output[j * col_num + col] = buf1[j];
}
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment