Commit aaa65f24 authored by Yi Luo
Browse files

Correct function signature for Visual Studio

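- Passing __m256i arguments by value causes a Visual Studio build warning
  about an unaligned formal parameter, so butter_fly() now takes its
  arguments as const __m256i pointers instead.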

Change-Id: I6e122c4fec2505ef3458e4bdf218d3cd30bb494f
parent fa4ff85b
......@@ -71,8 +71,8 @@ static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
__m256i x0, x1;
x0 = _mm256_unpacklo_epi16(*a0, *a1);
x1 = _mm256_unpackhi_epi16(*a0, *a1);
*b0 = butter_fly(x0, x1, *c0);
*b1 = butter_fly(x0, x1, *c1);
*b0 = butter_fly(&x0, &x1, c0);
*b1 = butter_fly(&x0, &x1, c1);
}
void av1_idct16_avx2(__m256i *in);
......
......@@ -163,10 +163,11 @@ static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
}
static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1,
const __m256i *cospi) {
const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
__m256i y0 = _mm256_madd_epi16(a0, cospi);
__m256i y1 = _mm256_madd_epi16(a1, cospi);
__m256i y0 = _mm256_madd_epi16(*a0, *cospi);
__m256i y1 = _mm256_madd_epi16(*a1, *cospi);
y0 = _mm256_add_epi32(y0, dct_rounding);
y1 = _mm256_add_epi32(y1, dct_rounding);
......
......@@ -269,8 +269,8 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(v0, v1);
x1 = _mm256_unpackhi_epi16(v0, v1);
t0 = butter_fly(x0, x1, cospi_p16_p16);
t1 = butter_fly(x0, x1, cospi_p16_m16);
t0 = butter_fly(&x0, &x1, &cospi_p16_p16);
t1 = butter_fly(&x0, &x1, &cospi_p16_m16);
// 4, 12
v0 = _mm256_sub_epi16(s1, s2);
......@@ -279,8 +279,8 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(v0, v1);
x1 = _mm256_unpackhi_epi16(v0, v1);
t2 = butter_fly(x0, x1, cospi_p24_p08);
t3 = butter_fly(x0, x1, cospi_m08_p24);
t2 = butter_fly(&x0, &x1, &cospi_p24_p08);
t3 = butter_fly(&x0, &x1, &cospi_m08_p24);
// 2, 6, 10, 14
s0 = _mm256_sub_epi16(u3, u4);
......@@ -294,8 +294,8 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(s2, s1);
x1 = _mm256_unpackhi_epi16(s2, s1);
v2 = butter_fly(x0, x1, cospi_p16_p16); // output[5]
v1 = butter_fly(x0, x1, cospi_p16_m16); // output[6]
v2 = butter_fly(&x0, &x1, &cospi_p16_p16); // output[5]
v1 = butter_fly(&x0, &x1, &cospi_p16_m16); // output[6]
s0 = _mm256_add_epi16(v0, v1); // step[4]
s1 = _mm256_sub_epi16(v0, v1); // step[5]
......@@ -306,14 +306,14 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(s0, s3);
x1 = _mm256_unpackhi_epi16(s0, s3);
t4 = butter_fly(x0, x1, cospi_p28_p04);
t5 = butter_fly(x0, x1, cospi_m04_p28);
t4 = butter_fly(&x0, &x1, &cospi_p28_p04);
t5 = butter_fly(&x0, &x1, &cospi_m04_p28);
// 10, 6
x0 = _mm256_unpacklo_epi16(s1, s2);
x1 = _mm256_unpackhi_epi16(s1, s2);
t6 = butter_fly(x0, x1, cospi_p12_p20);
t7 = butter_fly(x0, x1, cospi_m20_p12);
t6 = butter_fly(&x0, &x1, &cospi_p12_p20);
t7 = butter_fly(&x0, &x1, &cospi_m20_p12);
// 1, 3, 5, 7, 9, 11, 13, 15
s0 = _mm256_sub_epi16(in[7], in[8]); // step[8]
......@@ -337,14 +337,14 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(u5, u2);
x1 = _mm256_unpackhi_epi16(u5, u2);
s2 = butter_fly(x0, x1, cospi_p16_p16); // step[13]
s5 = butter_fly(x0, x1, cospi_p16_m16); // step[10]
s2 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[13]
s5 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[10]
x0 = _mm256_unpacklo_epi16(u4, u3);
x1 = _mm256_unpackhi_epi16(u4, u3);
s3 = butter_fly(x0, x1, cospi_p16_p16); // step[12]
s4 = butter_fly(x0, x1, cospi_p16_m16); // step[11]
s3 = butter_fly(&x0, &x1, &cospi_p16_p16); // step[12]
s4 = butter_fly(&x0, &x1, &cospi_p16_m16); // step[11]
u0 = _mm256_add_epi16(s0, s4); // output[8]
u1 = _mm256_add_epi16(s1, s5);
......@@ -364,14 +364,14 @@ static void fdct16_avx2(__m256i *in) {
x0 = _mm256_unpacklo_epi16(u1, u6);
x1 = _mm256_unpackhi_epi16(u1, u6);
s1 = butter_fly(x0, x1, cospi_m08_p24);
s6 = butter_fly(x0, x1, cospi_p24_p08);
s1 = butter_fly(&x0, &x1, &cospi_m08_p24);
s6 = butter_fly(&x0, &x1, &cospi_p24_p08);
x0 = _mm256_unpacklo_epi16(u2, u5);
x1 = _mm256_unpackhi_epi16(u2, u5);
s2 = butter_fly(x0, x1, cospi_m24_m08);
s5 = butter_fly(x0, x1, cospi_m08_p24);
s2 = butter_fly(&x0, &x1, &cospi_m24_m08);
s5 = butter_fly(&x0, &x1, &cospi_m08_p24);
// stage 5
u0 = _mm256_add_epi16(s0, s1);
......@@ -386,23 +386,23 @@ static void fdct16_avx2(__m256i *in) {
// stage 6
x0 = _mm256_unpacklo_epi16(u0, u7);
x1 = _mm256_unpackhi_epi16(u0, u7);
in[1] = butter_fly(x0, x1, cospi_p30_p02);
in[15] = butter_fly(x0, x1, cospi_m02_p30);
in[1] = butter_fly(&x0, &x1, &cospi_p30_p02);
in[15] = butter_fly(&x0, &x1, &cospi_m02_p30);
x0 = _mm256_unpacklo_epi16(u1, u6);
x1 = _mm256_unpackhi_epi16(u1, u6);
in[9] = butter_fly(x0, x1, cospi_p14_p18);
in[7] = butter_fly(x0, x1, cospi_m18_p14);
in[9] = butter_fly(&x0, &x1, &cospi_p14_p18);
in[7] = butter_fly(&x0, &x1, &cospi_m18_p14);
x0 = _mm256_unpacklo_epi16(u2, u5);
x1 = _mm256_unpackhi_epi16(u2, u5);
in[5] = butter_fly(x0, x1, cospi_p22_p10);
in[11] = butter_fly(x0, x1, cospi_m10_p22);
in[5] = butter_fly(&x0, &x1, &cospi_p22_p10);
in[11] = butter_fly(&x0, &x1, &cospi_m10_p22);
x0 = _mm256_unpacklo_epi16(u3, u4);
x1 = _mm256_unpackhi_epi16(u3, u4);
in[13] = butter_fly(x0, x1, cospi_p06_p26);
in[3] = butter_fly(x0, x1, cospi_m26_p06);
in[13] = butter_fly(&x0, &x1, &cospi_p06_p26);
in[3] = butter_fly(&x0, &x1, &cospi_m26_p06);
}
void fadst16_avx2(__m256i *in) {
......@@ -1249,23 +1249,23 @@ static void fdct16_odd_avx2(__m256i *in) {
u0 = _mm256_unpacklo_epi16(in[4], in[11]);
u1 = _mm256_unpackhi_epi16(in[4], in[11]);
y4 = butter_fly(u0, u1, cospi_m16_p16);
y11 = butter_fly(u0, u1, cospi_p16_p16);
y4 = butter_fly(&u0, &u1, &cospi_m16_p16);
y11 = butter_fly(&u0, &u1, &cospi_p16_p16);
u0 = _mm256_unpacklo_epi16(in[5], in[10]);
u1 = _mm256_unpackhi_epi16(in[5], in[10]);
y5 = butter_fly(u0, u1, cospi_m16_p16);
y10 = butter_fly(u0, u1, cospi_p16_p16);
y5 = butter_fly(&u0, &u1, &cospi_m16_p16);
y10 = butter_fly(&u0, &u1, &cospi_p16_p16);
u0 = _mm256_unpacklo_epi16(in[6], in[9]);
u1 = _mm256_unpackhi_epi16(in[6], in[9]);
y6 = butter_fly(u0, u1, cospi_m16_p16);
y9 = butter_fly(u0, u1, cospi_p16_p16);
y6 = butter_fly(&u0, &u1, &cospi_m16_p16);
y9 = butter_fly(&u0, &u1, &cospi_p16_p16);
u0 = _mm256_unpacklo_epi16(in[7], in[8]);
u1 = _mm256_unpackhi_epi16(in[7], in[8]);
y7 = butter_fly(u0, u1, cospi_m16_p16);
y8 = butter_fly(u0, u1, cospi_p16_p16);
y7 = butter_fly(&u0, &u1, &cospi_m16_p16);
y8 = butter_fly(&u0, &u1, &cospi_p16_p16);
y12 = in[12];
y13 = in[13];
......@@ -1302,23 +1302,23 @@ static void fdct16_odd_avx2(__m256i *in) {
u0 = _mm256_unpacklo_epi16(x2, x13);
u1 = _mm256_unpackhi_epi16(x2, x13);
y2 = butter_fly(u0, u1, cospi_m08_p24);
y13 = butter_fly(u0, u1, cospi_p24_p08);
y2 = butter_fly(&u0, &u1, &cospi_m08_p24);
y13 = butter_fly(&u0, &u1, &cospi_p24_p08);
u0 = _mm256_unpacklo_epi16(x3, x12);
u1 = _mm256_unpackhi_epi16(x3, x12);
y3 = butter_fly(u0, u1, cospi_m08_p24);
y12 = butter_fly(u0, u1, cospi_p24_p08);
y3 = butter_fly(&u0, &u1, &cospi_m08_p24);
y12 = butter_fly(&u0, &u1, &cospi_p24_p08);
u0 = _mm256_unpacklo_epi16(x4, x11);
u1 = _mm256_unpackhi_epi16(x4, x11);
y4 = butter_fly(u0, u1, cospi_m24_m08);
y11 = butter_fly(u0, u1, cospi_m08_p24);
y4 = butter_fly(&u0, &u1, &cospi_m24_m08);
y11 = butter_fly(&u0, &u1, &cospi_m08_p24);
u0 = _mm256_unpacklo_epi16(x5, x10);
u1 = _mm256_unpackhi_epi16(x5, x10);
y5 = butter_fly(u0, u1, cospi_m24_m08);
y10 = butter_fly(u0, u1, cospi_m08_p24);
y5 = butter_fly(&u0, &u1, &cospi_m24_m08);
y10 = butter_fly(&u0, &u1, &cospi_m08_p24);
// stage 5
x0 = _mm256_add_epi16(y0, y3);
......@@ -1351,23 +1351,23 @@ static void fdct16_odd_avx2(__m256i *in) {
u0 = _mm256_unpacklo_epi16(x1, x14);
u1 = _mm256_unpackhi_epi16(x1, x14);
y1 = butter_fly(u0, u1, cospi_m04_p28);
y14 = butter_fly(u0, u1, cospi_p28_p04);
y1 = butter_fly(&u0, &u1, &cospi_m04_p28);
y14 = butter_fly(&u0, &u1, &cospi_p28_p04);
u0 = _mm256_unpacklo_epi16(x2, x13);
u1 = _mm256_unpackhi_epi16(x2, x13);
y2 = butter_fly(u0, u1, cospi_m28_m04);
y13 = butter_fly(u0, u1, cospi_m04_p28);
y2 = butter_fly(&u0, &u1, &cospi_m28_m04);
y13 = butter_fly(&u0, &u1, &cospi_m04_p28);
u0 = _mm256_unpacklo_epi16(x5, x10);
u1 = _mm256_unpackhi_epi16(x5, x10);
y5 = butter_fly(u0, u1, cospi_m20_p12);
y10 = butter_fly(u0, u1, cospi_p12_p20);
y5 = butter_fly(&u0, &u1, &cospi_m20_p12);
y10 = butter_fly(&u0, &u1, &cospi_p12_p20);
u0 = _mm256_unpacklo_epi16(x6, x9);
u1 = _mm256_unpackhi_epi16(x6, x9);
y6 = butter_fly(u0, u1, cospi_m12_m20);
y9 = butter_fly(u0, u1, cospi_m20_p12);
y6 = butter_fly(&u0, &u1, &cospi_m12_m20);
y9 = butter_fly(&u0, &u1, &cospi_m20_p12);
// stage 7
x0 = _mm256_add_epi16(y0, y1);
......@@ -1391,43 +1391,43 @@ static void fdct16_odd_avx2(__m256i *in) {
// stage 8
u0 = _mm256_unpacklo_epi16(x0, x15);
u1 = _mm256_unpackhi_epi16(x0, x15);
in[0] = butter_fly(u0, u1, cospi_p31_p01);
in[15] = butter_fly(u0, u1, cospi_m01_p31);
in[0] = butter_fly(&u0, &u1, &cospi_p31_p01);
in[15] = butter_fly(&u0, &u1, &cospi_m01_p31);
u0 = _mm256_unpacklo_epi16(x1, x14);
u1 = _mm256_unpackhi_epi16(x1, x14);
in[1] = butter_fly(u0, u1, cospi_p15_p17);
in[14] = butter_fly(u0, u1, cospi_m17_p15);
in[1] = butter_fly(&u0, &u1, &cospi_p15_p17);
in[14] = butter_fly(&u0, &u1, &cospi_m17_p15);
u0 = _mm256_unpacklo_epi16(x2, x13);
u1 = _mm256_unpackhi_epi16(x2, x13);
in[2] = butter_fly(u0, u1, cospi_p23_p09);
in[13] = butter_fly(u0, u1, cospi_m09_p23);
in[2] = butter_fly(&u0, &u1, &cospi_p23_p09);
in[13] = butter_fly(&u0, &u1, &cospi_m09_p23);
u0 = _mm256_unpacklo_epi16(x3, x12);
u1 = _mm256_unpackhi_epi16(x3, x12);
in[3] = butter_fly(u0, u1, cospi_p07_p25);
in[12] = butter_fly(u0, u1, cospi_m25_p07);
in[3] = butter_fly(&u0, &u1, &cospi_p07_p25);
in[12] = butter_fly(&u0, &u1, &cospi_m25_p07);
u0 = _mm256_unpacklo_epi16(x4, x11);
u1 = _mm256_unpackhi_epi16(x4, x11);
in[4] = butter_fly(u0, u1, cospi_p27_p05);
in[11] = butter_fly(u0, u1, cospi_m05_p27);
in[4] = butter_fly(&u0, &u1, &cospi_p27_p05);
in[11] = butter_fly(&u0, &u1, &cospi_m05_p27);
u0 = _mm256_unpacklo_epi16(x5, x10);
u1 = _mm256_unpackhi_epi16(x5, x10);
in[5] = butter_fly(u0, u1, cospi_p11_p21);
in[10] = butter_fly(u0, u1, cospi_m21_p11);
in[5] = butter_fly(&u0, &u1, &cospi_p11_p21);
in[10] = butter_fly(&u0, &u1, &cospi_m21_p11);
u0 = _mm256_unpacklo_epi16(x6, x9);
u1 = _mm256_unpackhi_epi16(x6, x9);
in[6] = butter_fly(u0, u1, cospi_p19_p13);
in[9] = butter_fly(u0, u1, cospi_m13_p19);
in[6] = butter_fly(&u0, &u1, &cospi_p19_p13);
in[9] = butter_fly(&u0, &u1, &cospi_m13_p19);
u0 = _mm256_unpacklo_epi16(x7, x8);
u1 = _mm256_unpackhi_epi16(x7, x8);
in[7] = butter_fly(u0, u1, cospi_p03_p29);
in[8] = butter_fly(u0, u1, cospi_m29_p03);
in[7] = butter_fly(&u0, &u1, &cospi_p03_p29);
in[8] = butter_fly(&u0, &u1, &cospi_m29_p03);
}
static void fdct32_avx2(__m256i *in0, __m256i *in1) {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment