Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
A
aom-rav1e
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Xiph.Org
aom-rav1e
Commits
043f4964
Commit
043f4964
authored
Feb 13, 2018
by
Linfeng Zhang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Implement fdct4x8_new_sse2 and fadst4x8_new_sse2
Change-Id: I9ab260c5ca31fe7e06bfc0f806893463c5255c45
parent
1fffc1f4
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
206 additions
and
25 deletions
+206
-25
av1/common/x86/av1_txfm_sse2.h
av1/common/x86/av1_txfm_sse2.h
+16
-0
av1/encoder/x86/av1_fwd_txfm_sse2.c
av1/encoder/x86/av1_fwd_txfm_sse2.c
+190
-25
No files found.
av1/common/x86/av1_txfm_sse2.h
View file @
043f4964
...
...
@@ -24,6 +24,22 @@
extern
"C"
{
#endif
// Width-4 butterfly on 16-bit lanes.
//
// Interleaves the low four 16-bit lanes of *in0 and *in1, then for each
// (in0, in1) pair computes two weighted sums with _mm_madd_epi16: one against
// the packed weight pairs in *w0 and one against those in *w1. Each 32-bit sum
// has __rounding added, is arithmetic-shifted right by cos_bit, and is packed
// back to 16 bits with signed saturation.
//
//   w0, w1      packed 16-bit weight pairs for the first / second output
//   __rounding  value added to every 32-bit sum before the shift
//   cos_bit     right-shift amount applied after rounding
//   in0, in1    inputs; only their low four 16-bit lanes are read
//   out0, out1  results; the low four lanes hold the four butterfly outputs
//               and the high four lanes repeat the same four values
static INLINE void btf_16_w4_sse2(
    const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
    const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
    __m128i *const out0, __m128i *const out1) {
  const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
  const __m128i u0 = _mm_madd_epi16(t0, *w0);
  const __m128i v0 = _mm_madd_epi16(t0, *w1);
  const __m128i a0 = _mm_add_epi32(u0, __rounding);
  const __m128i b0 = _mm_add_epi32(v0, __rounding);
  const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
  const __m128i d0 = _mm_srai_epi32(b0, cos_bit);

  *out0 = _mm_packs_epi32(c0, c0);
  // Pack d0 with itself so *out1 never carries lanes of the *out0 result
  // (previously its high half was filled from c0). The low four lanes are
  // unchanged.
  *out1 = _mm_packs_epi32(d0, d0);
}
#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
{ \
__m128i t0 = _mm_unpacklo_epi16(in0, in1); \
...
...
av1/encoder/x86/av1_fwd_txfm_sse2.c
View file @
043f4964
...
...
@@ -12,7 +12,7 @@
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
// TODO(linfengz):
specialize fdct4x8 and fadst4x8 optimization
.
// TODO(linfengz):
refine fdct4x8 and fadst4x8 optimization (if possible)
.
static
void
fdct4x4_new_sse2
(
const
__m128i
*
input
,
__m128i
*
output
,
int8_t
cos_bit
)
{
...
...
@@ -78,6 +78,75 @@ void fdct4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
output
[
3
]
=
x2
[
3
];
}
// Forward 8-point DCT over eight rows of 4-wide 16-bit data (SSE2).
//
// Reads input[0..7] and writes output[0..7]. All of input is consumed before
// anything is stored to output. Sums and differences use saturating 16-bit
// arithmetic; rotations go through btf_16_w4_sse2 with weights taken from the
// table returned by cospi_arr(cos_bit) and a rounding term of
// 1 << (cos_bit - 1).
void fdct4x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  const int32_t *const cos_tab = cospi_arr(cos_bit);
  const __m128i rnd = _mm_set1_epi32(1 << (cos_bit - 1));

  // Packed weight pairs, named w_<sign><index>_<sign><index>.
  const __m128i w_m32_p32 = pair_set_epi16(-cos_tab[32], cos_tab[32]);
  const __m128i w_p32_p32 = pair_set_epi16(cos_tab[32], cos_tab[32]);
  const __m128i w_p32_m32 = pair_set_epi16(cos_tab[32], -cos_tab[32]);
  const __m128i w_p48_p16 = pair_set_epi16(cos_tab[48], cos_tab[16]);
  const __m128i w_m16_p48 = pair_set_epi16(-cos_tab[16], cos_tab[48]);
  const __m128i w_p56_p08 = pair_set_epi16(cos_tab[56], cos_tab[8]);
  const __m128i w_m08_p56 = pair_set_epi16(-cos_tab[8], cos_tab[56]);
  const __m128i w_p24_p40 = pair_set_epi16(cos_tab[24], cos_tab[40]);
  const __m128i w_m40_p24 = pair_set_epi16(-cos_tab[40], cos_tab[24]);

  // Stage 1: mirrored butterflies, row i against row 7 - i.
  __m128i s1[8];
  for (int i = 0; i < 4; ++i) {
    const __m128i near = input[i];
    const __m128i far = input[7 - i];
    s1[i] = _mm_adds_epi16(near, far);
    s1[7 - i] = _mm_subs_epi16(near, far);
  }

  // Stage 2: mirrored butterflies on the even half (i against 3 - i); the odd
  // half passes 4 and 7 through and rotates 5/6.
  __m128i s2[8];
  for (int i = 0; i < 2; ++i) {
    s2[i] = _mm_adds_epi16(s1[i], s1[3 - i]);
    s2[3 - i] = _mm_subs_epi16(s1[i], s1[3 - i]);
  }
  s2[4] = s1[4];
  s2[7] = s1[7];
  btf_16_w4_sse2(&w_m32_p32, &w_p32_p32, rnd, cos_bit, &s1[5], &s1[6], &s2[5],
                 &s2[6]);

  // Stage 3: rotations finish the even half; butterflies on the odd half.
  __m128i s3[8];
  btf_16_w4_sse2(&w_p32_p32, &w_p32_m32, rnd, cos_bit, &s2[0], &s2[1], &s3[0],
                 &s3[1]);
  btf_16_w4_sse2(&w_p48_p16, &w_m16_p48, rnd, cos_bit, &s2[2], &s2[3], &s3[2],
                 &s3[3]);
  s3[4] = _mm_adds_epi16(s2[4], s2[5]);
  s3[5] = _mm_subs_epi16(s2[4], s2[5]);
  s3[6] = _mm_subs_epi16(s2[7], s2[6]);
  s3[7] = _mm_adds_epi16(s2[7], s2[6]);

  // Stages 4 and 5: the odd-half rotations are stored straight into their
  // final slots (results 4, 7, 5, 6 land in output 1, 7, 5, 3), and the even
  // half (results 0, 1, 2, 3) lands in output 0, 4, 2, 6.
  btf_16_w4_sse2(&w_p56_p08, &w_m08_p56, rnd, cos_bit, &s3[4], &s3[7],
                 &output[1], &output[7]);
  btf_16_w4_sse2(&w_p24_p40, &w_m40_p24, rnd, cos_bit, &s3[5], &s3[6],
                 &output[5], &output[3]);
  output[0] = s3[0];
  output[4] = s3[1];
  output[2] = s3[2];
  output[6] = s3[3];
}
void
fdct8_new_sse2
(
const
__m128i
*
input
,
__m128i
*
output
,
int8_t
cos_bit
)
{
const
int32_t
*
cospi
=
cospi_arr
(
cos_bit
);
const
__m128i
__rounding
=
_mm_set1_epi32
(
1
<<
(
cos_bit
-
1
));
...
...
@@ -1392,6 +1461,102 @@ static void fadst4_new_sse2(const __m128i *input, __m128i *output,
output
[
3
]
=
_mm_srli_si128
(
output
[
1
],
8
);
}
// Forward 8-point ADST over eight rows of 4-wide 16-bit data (SSE2).
//
// Reads input[0..7] and writes output[0..7]. All of input is consumed before
// anything is stored to output. Sums, differences and negations use
// saturating 16-bit arithmetic; rotations go through btf_16_w4_sse2 with
// weights taken from the table returned by cospi_arr(cos_bit) and a rounding
// term of 1 << (cos_bit - 1).
void fadst4x8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  const int32_t *const cos_tab = cospi_arr(cos_bit);
  const __m128i zero = _mm_setzero_si128();
  const __m128i rnd = _mm_set1_epi32(1 << (cos_bit - 1));

  // Packed weight pairs, named w_<sign><index>_<sign><index>.
  const __m128i w_p32_p32 = pair_set_epi16(cos_tab[32], cos_tab[32]);
  const __m128i w_p32_m32 = pair_set_epi16(cos_tab[32], -cos_tab[32]);
  const __m128i w_p16_p48 = pair_set_epi16(cos_tab[16], cos_tab[48]);
  const __m128i w_p48_m16 = pair_set_epi16(cos_tab[48], -cos_tab[16]);
  const __m128i w_m48_p16 = pair_set_epi16(-cos_tab[48], cos_tab[16]);
  const __m128i w_p04_p60 = pair_set_epi16(cos_tab[4], cos_tab[60]);
  const __m128i w_p60_m04 = pair_set_epi16(cos_tab[60], -cos_tab[4]);
  const __m128i w_p20_p44 = pair_set_epi16(cos_tab[20], cos_tab[44]);
  const __m128i w_p44_m20 = pair_set_epi16(cos_tab[44], -cos_tab[20]);
  const __m128i w_p36_p28 = pair_set_epi16(cos_tab[36], cos_tab[28]);
  const __m128i w_p28_m36 = pair_set_epi16(cos_tab[28], -cos_tab[36]);
  const __m128i w_p52_p12 = pair_set_epi16(cos_tab[52], cos_tab[12]);
  const __m128i w_p12_m52 = pair_set_epi16(cos_tab[12], -cos_tab[52]);

  // Stage 1: permute the rows and negate (0 - x) the ones flagged below.
  static const uint8_t src_row[8] = { 0, 7, 3, 4, 1, 6, 2, 5 };
  static const uint8_t negate[8] = { 0, 1, 1, 0, 1, 0, 0, 1 };
  __m128i s1[8];
  for (int i = 0; i < 8; ++i) {
    const __m128i row = input[src_row[i]];
    s1[i] = negate[i] ? _mm_subs_epi16(zero, row) : row;
  }

  // Stage 2: in each group of four, pass the first two rows through and
  // rotate the last two.
  __m128i s2[8];
  for (int base = 0; base < 8; base += 4) {
    s2[base] = s1[base];
    s2[base + 1] = s1[base + 1];
    btf_16_w4_sse2(&w_p32_p32, &w_p32_m32, rnd, cos_bit, &s1[base + 2],
                   &s1[base + 3], &s2[base + 2], &s2[base + 3]);
  }

  // Stage 3: butterflies at distance 2 inside each group of four.
  __m128i s3[8];
  for (int base = 0; base < 8; base += 4) {
    for (int i = base; i < base + 2; ++i) {
      s3[i] = _mm_adds_epi16(s2[i], s2[i + 2]);
      s3[i + 2] = _mm_subs_epi16(s2[i], s2[i + 2]);
    }
  }

  // Stage 4: first group passes through, second group is rotated.
  __m128i s4[8];
  for (int i = 0; i < 4; ++i) {
    s4[i] = s3[i];
  }
  btf_16_w4_sse2(&w_p16_p48, &w_p48_m16, rnd, cos_bit, &s3[4], &s3[5], &s4[4],
                 &s4[5]);
  btf_16_w4_sse2(&w_m48_p16, &w_p16_p48, rnd, cos_bit, &s3[6], &s3[7], &s4[6],
                 &s4[7]);

  // Stage 5: butterflies at distance 4.
  __m128i s5[8];
  for (int i = 0; i < 4; ++i) {
    s5[i] = _mm_adds_epi16(s4[i], s4[i + 4]);
    s5[i + 4] = _mm_subs_epi16(s4[i], s4[i + 4]);
  }

  // Stages 6 and 7: the final rotations are stored straight into their output
  // slots; rotation results 0..7 land in output 7, 0, 5, 2, 3, 4, 1, 6.
  btf_16_w4_sse2(&w_p04_p60, &w_p60_m04, rnd, cos_bit, &s5[0], &s5[1],
                 &output[7], &output[0]);
  btf_16_w4_sse2(&w_p20_p44, &w_p44_m20, rnd, cos_bit, &s5[2], &s5[3],
                 &output[5], &output[2]);
  btf_16_w4_sse2(&w_p36_p28, &w_p28_m36, rnd, cos_bit, &s5[4], &s5[5],
                 &output[3], &output[4]);
  btf_16_w4_sse2(&w_p52_p12, &w_p12_m52, rnd, cos_bit, &s5[6], &s5[7],
                 &output[1], &output[6]);
}
static
void
fadst8x4_new_sse2
(
const
__m128i
*
input
,
__m128i
*
output
,
int8_t
cos_bit
)
{
const
int32_t
*
sinpi
=
sinpi_arr
(
cos_bit
);
...
...
@@ -1826,41 +1991,41 @@ static const transform_2d_sse2 txfm4_arr[] = {
};
static
const
transform_2d_sse2
txfm4x8_arr
[
16
]
=
{
{
fdct
8_new_sse2
,
fdct4_new_sse2
},
// DCT_DCT
{
fadst
8_new_sse2
,
fdct4_new_sse2
},
// ADST_DCT
{
fdct
8_new_sse2
,
fadst8x4_new_sse2
},
// DCT_ADST
{
fadst
8_new_sse2
,
fadst8x4_new_sse2
},
// ADST_ADST
{
fadst
8_new_sse2
,
fdct4_new_sse2
},
// FLIPADST_DCT
{
fdct
8_new_sse2
,
fadst8x4_new_sse2
},
// DCT_FLIPADST
{
fadst
8_new_sse2
,
fadst8x4_new_sse2
},
// FLIPADST_FLIPADST
{
fadst
8_new_sse2
,
fadst8x4_new_sse2
},
// ADST_FLIPADST
{
fadst
8_new_sse2
,
fadst8x4_new_sse2
},
// FLIPADST_ADST
{
fdct
4x8_new_sse2
,
fdct4_new_sse2
},
// DCT_DCT
{
fadst
4x8_new_sse2
,
fdct4_new_sse2
},
// ADST_DCT
{
fdct
4x8_new_sse2
,
fadst8x4_new_sse2
},
// DCT_ADST
{
fadst
4x8_new_sse2
,
fadst8x4_new_sse2
},
// ADST_ADST
{
fadst
4x8_new_sse2
,
fdct4_new_sse2
},
// FLIPADST_DCT
{
fdct
4x8_new_sse2
,
fadst8x4_new_sse2
},
// DCT_FLIPADST
{
fadst
4x8_new_sse2
,
fadst8x4_new_sse2
},
// FLIPADST_FLIPADST
{
fadst
4x8_new_sse2
,
fadst8x4_new_sse2
},
// ADST_FLIPADST
{
fadst
4x8_new_sse2
,
fadst8x4_new_sse2
},
// FLIPADST_ADST
{
fidentity8_new_sse2
,
fidentity8x4_new_sse2
},
// IDTX
{
fdct
8_new_sse2
,
fidentity8x4_new_sse2
},
// V_DCT
{
fdct
4x8_new_sse2
,
fidentity8x4_new_sse2
},
// V_DCT
{
fidentity8_new_sse2
,
fdct4_new_sse2
},
// H_DCT
{
fadst
8_new_sse2
,
fidentity8x4_new_sse2
},
// V_ADST
{
fadst
4x8_new_sse2
,
fidentity8x4_new_sse2
},
// V_ADST
{
fidentity8_new_sse2
,
fadst8x4_new_sse2
},
// H_ADST
{
fadst
8_new_sse2
,
fidentity8x4_new_sse2
},
// V_FLIPADST
{
fadst
4x8_new_sse2
,
fidentity8x4_new_sse2
},
// V_FLIPADST
{
fidentity8_new_sse2
,
fadst8x4_new_sse2
},
// H_FLIPADST
};
static
const
transform_2d_sse2
txfm8x4_arr
[]
=
{
{
fdct4_new_sse2
,
fdct
8_new_sse2
},
// DCT_DCT
{
fadst8x4_new_sse2
,
fdct
8_new_sse2
},
// ADST_DCT
{
fdct4_new_sse2
,
fadst
8_new_sse2
},
// DCT_ADST
{
fadst8x4_new_sse2
,
fadst
8_new_sse2
},
// ADST_ADST
{
fadst8x4_new_sse2
,
fdct
8_new_sse2
},
// FLIPADST_DCT
{
fdct4_new_sse2
,
fadst
8_new_sse2
},
// DCT_FLIPADST
{
fadst8x4_new_sse2
,
fadst
8_new_sse2
},
// FLIPADST_FLIPADST
{
fadst8x4_new_sse2
,
fadst
8_new_sse2
},
// ADST_FLIPADST
{
fadst8x4_new_sse2
,
fadst
8_new_sse2
},
// FLIPADST_ADST
{
fdct4_new_sse2
,
fdct
4x8_new_sse2
},
// DCT_DCT
{
fadst8x4_new_sse2
,
fdct
4x8_new_sse2
},
// ADST_DCT
{
fdct4_new_sse2
,
fadst
4x8_new_sse2
},
// DCT_ADST
{
fadst8x4_new_sse2
,
fadst
4x8_new_sse2
},
// ADST_ADST
{
fadst8x4_new_sse2
,
fdct
4x8_new_sse2
},
// FLIPADST_DCT
{
fdct4_new_sse2
,
fadst
4x8_new_sse2
},
// DCT_FLIPADST
{
fadst8x4_new_sse2
,
fadst
4x8_new_sse2
},
// FLIPADST_FLIPADST
{
fadst8x4_new_sse2
,
fadst
4x8_new_sse2
},
// ADST_FLIPADST
{
fadst8x4_new_sse2
,
fadst
4x8_new_sse2
},
// FLIPADST_ADST
{
fidentity8x4_new_sse2
,
fidentity8_new_sse2
},
// IDTX
{
fdct4_new_sse2
,
fidentity8_new_sse2
},
// V_DCT
{
fidentity8x4_new_sse2
,
fdct
8_new_sse2
},
// H_DCT
{
fidentity8x4_new_sse2
,
fdct
4x8_new_sse2
},
// H_DCT
{
fadst8x4_new_sse2
,
fidentity8_new_sse2
},
// V_ADST
{
fidentity8x4_new_sse2
,
fadst
8_new_sse2
},
// H_ADST
{
fidentity8x4_new_sse2
,
fadst
4x8_new_sse2
},
// H_ADST
{
fadst8x4_new_sse2
,
fidentity8_new_sse2
},
// V_FLIPADST
{
fidentity8x4_new_sse2
,
fadst
8_new_sse2
},
// H_FLIPADST
{
fidentity8x4_new_sse2
,
fadst
4x8_new_sse2
},
// H_FLIPADST
};
static
const
transform_2d_sse2
txfm8_arr
[]
=
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment