Xiph.Org / aom-rav1e, commit 997ba51e
Authored Dec 09, 2016 by Yi Luo; committed by Yaowu Xu, Dec 10, 2016
Parent: 6847860b

Fix 16x16 HT avx2 mismatch with C

BUG=aomedia:109
Change-Id: Iaada41806771392478d3e09af249b9701e07beec

Changes: 1 file
av1/encoder/x86/hybrid_fwd_txfm_avx2.c
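The pattern repeated throughout the hunks below is libaom's fixed-point rounding step: add a bias, then arithmetic-shift right by DCT_CONST_BITS. A minimal scalar sketch, assuming the usual libaom constants (DCT_CONST_BITS == 14, with dct_rounding holding 1 << (DCT_CONST_BITS - 1) in every 32-bit lane); the AVX2 code performs the same step eight lanes at a time with _mm256_add_epi32 followed by _mm256_srai_epi32:

/* Scalar sketch of the round-then-shift idiom; constants assumed, not
 * quoted from this file. */
#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

static int32_t round_shift(int32_t x) {
  /* Add half of the divisor, then arithmetic-shift right:
   * divide by 2^DCT_CONST_BITS with round-to-nearest. */
  return (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
}

int main(void) {
  printf("%d\n", round_shift(40000));   /* (40000 + 8192) >> 14 == 2 */
  printf("%d\n", round_shift(-40000));  /* rounds toward nearest, not zero */
  return 0;
}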
...
...
@@ -538,24 +538,6 @@ void fadst16_avx2(__m256i *in) {
  v15 = _mm256_sub_epi32(x7, x15);

  // low 256 bits rounding
  u0 = _mm256_add_epi32(u0, dct_rounding);
  u1 = _mm256_add_epi32(u1, dct_rounding);
  u2 = _mm256_add_epi32(u2, dct_rounding);
  u3 = _mm256_add_epi32(u3, dct_rounding);
  u4 = _mm256_add_epi32(u4, dct_rounding);
  u5 = _mm256_add_epi32(u5, dct_rounding);
  u6 = _mm256_add_epi32(u6, dct_rounding);
  u7 = _mm256_add_epi32(u7, dct_rounding);
  u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
  u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
  u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
  u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
  u4 = _mm256_srai_epi32(u4, DCT_CONST_BITS);
  u5 = _mm256_srai_epi32(u5, DCT_CONST_BITS);
  u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
  u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
  u8 = _mm256_add_epi32(u8, dct_rounding);
  u9 = _mm256_add_epi32(u9, dct_rounding);
  u10 = _mm256_add_epi32(u10, dct_rounding);
...
...
@@ -575,24 +557,6 @@ void fadst16_avx2(__m256i *in) {
  u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);

  // high 256 bits rounding
  v0 = _mm256_add_epi32(v0, dct_rounding);
  v1 = _mm256_add_epi32(v1, dct_rounding);
  v2 = _mm256_add_epi32(v2, dct_rounding);
  v3 = _mm256_add_epi32(v3, dct_rounding);
  v4 = _mm256_add_epi32(v4, dct_rounding);
  v5 = _mm256_add_epi32(v5, dct_rounding);
  v6 = _mm256_add_epi32(v6, dct_rounding);
  v7 = _mm256_add_epi32(v7, dct_rounding);
  v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
  v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
  v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
  v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
  v4 = _mm256_srai_epi32(v4, DCT_CONST_BITS);
  v5 = _mm256_srai_epi32(v5, DCT_CONST_BITS);
  v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
  v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
  v8 = _mm256_add_epi32(v8, dct_rounding);
  v9 = _mm256_add_epi32(v9, dct_rounding);
  v10 = _mm256_add_epi32(v10, dct_rounding);
...
...
@@ -612,14 +576,6 @@ void fadst16_avx2(__m256i *in) {
  v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);

  // Saturation pack 32-bit to 16-bit
  x0 = _mm256_packs_epi32(u0, v0);
  x1 = _mm256_packs_epi32(u1, v1);
  x2 = _mm256_packs_epi32(u2, v2);
  x3 = _mm256_packs_epi32(u3, v3);
  x4 = _mm256_packs_epi32(u4, v4);
  x5 = _mm256_packs_epi32(u5, v5);
  x6 = _mm256_packs_epi32(u6, v6);
  x7 = _mm256_packs_epi32(u7, v7);
  x8 = _mm256_packs_epi32(u8, v8);
  x9 = _mm256_packs_epi32(u9, v9);
  x10 = _mm256_packs_epi32(u10, v10);
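A small standalone sketch (not part of the patch) of what _mm256_packs_epi32 does in the block above: it narrows 32-bit lanes back to 16 bits with signed saturation, operating per 128-bit lane, which mirrors the clamping the C reference transform gets when it stores int16_t coefficients:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m256i u = _mm256_setr_epi32(40000, -40000, 1, -1, 2, -2, 3, -3);
  __m256i v = _mm256_setr_epi32(5, 6, 7, 8, 9, 10, 11, 12);
  /* Low 128-bit lane of x holds u[0..3] then v[0..3], each saturated;
   * high lane holds u[4..7] then v[4..7]. */
  __m256i x = _mm256_packs_epi32(u, v);
  short out[16];
  _mm256_storeu_si256((__m256i *)out, x);
  for (int i = 0; i < 16; ++i) printf("%d ", out[i]);
  printf("\n");  /* 40000 saturates to 32767, -40000 to -32768 */
  return 0;
}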
...
...
@@ -630,15 +586,6 @@ void fadst16_avx2(__m256i *in) {
  x15 = _mm256_packs_epi32(u15, v15);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  y0 = _mm256_unpacklo_epi16(x8, x9);
  y1 = _mm256_unpackhi_epi16(x8, x9);
  s8 = _mm256_madd_epi16(y0, cospi_p04_p28);
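The unpack-then-madd idiom above computes one butterfly term per 32-bit lane: interleaving x8 and x9 and multiplying by an interleaved cosine pair yields x8[i]*c0 + x9[i]*c1, left in 32 bits for the later rounding. A sketch with hypothetical stand-in constants (the real code uses packed cospi pairs such as cospi_p04_p28):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  const short c0 = 3, c1 = 5;  /* stand-ins for a cospi pair */
  __m256i x8 = _mm256_set1_epi16(100);
  __m256i x9 = _mm256_set1_epi16(10);
  __m256i cospi_pair = _mm256_setr_epi16(c0, c1, c0, c1, c0, c1, c0, c1,
                                         c0, c1, c0, c1, c0, c1, c0, c1);
  /* Interleave: y0 = x8[0], x9[0], x8[1], x9[1], ... per 128-bit lane. */
  __m256i y0 = _mm256_unpacklo_epi16(x8, x9);
  /* madd: multiply 16-bit pairs, sum adjacent products into 32-bit lanes. */
  __m256i s8 = _mm256_madd_epi16(y0, cospi_pair);
  int out[8];
  _mm256_storeu_si256((__m256i *)out, s8);
  printf("%d\n", out[0]);  /* 100*3 + 10*5 == 350 */
  return 0;
}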
...
...
@@ -667,14 +614,46 @@ void fadst16_avx2(__m256i *in) {
  s15 = _mm256_madd_epi16(y0, cospi_p20_p12);
  x15 = _mm256_madd_epi16(y1, cospi_p20_p12);
  x0 = _mm256_add_epi16(s0, s4);
  x1 = _mm256_add_epi16(s1, s5);
  x2 = _mm256_add_epi16(s2, s6);
  x3 = _mm256_add_epi16(s3, s7);
  x4 = _mm256_sub_epi16(s0, s4);
  x5 = _mm256_sub_epi16(s1, s5);
  x6 = _mm256_sub_epi16(s2, s6);
  x7 = _mm256_sub_epi16(s3, s7);
  x0 = _mm256_add_epi32(u0, u4);
  s0 = _mm256_add_epi32(v0, v4);
  x1 = _mm256_add_epi32(u1, u5);
  s1 = _mm256_add_epi32(v1, v5);
  x2 = _mm256_add_epi32(u2, u6);
  s2 = _mm256_add_epi32(v2, v6);
  x3 = _mm256_add_epi32(u3, u7);
  s3 = _mm256_add_epi32(v3, v7);
  v8 = _mm256_sub_epi32(u0, u4);
  v9 = _mm256_sub_epi32(v0, v4);
  v10 = _mm256_sub_epi32(u1, u5);
  v11 = _mm256_sub_epi32(v1, v5);
  v12 = _mm256_sub_epi32(u2, u6);
  v13 = _mm256_sub_epi32(v2, v6);
  v14 = _mm256_sub_epi32(u3, u7);
  v15 = _mm256_sub_epi32(v3, v7);
  v8 = _mm256_add_epi32(v8, dct_rounding);
  v9 = _mm256_add_epi32(v9, dct_rounding);
  v10 = _mm256_add_epi32(v10, dct_rounding);
  v11 = _mm256_add_epi32(v11, dct_rounding);
  v12 = _mm256_add_epi32(v12, dct_rounding);
  v13 = _mm256_add_epi32(v13, dct_rounding);
  v14 = _mm256_add_epi32(v14, dct_rounding);
  v15 = _mm256_add_epi32(v15, dct_rounding);
  v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
  v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
  v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
  v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
  v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
  v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
  v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
  v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
  x4 = _mm256_packs_epi32(v8, v9);
  x5 = _mm256_packs_epi32(v10, v11);
  x6 = _mm256_packs_epi32(v12, v13);
  x7 = _mm256_packs_epi32(v14, v15);
  u8 = _mm256_add_epi32(s8, s12);
  u9 = _mm256_add_epi32(s9, s13);
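One plausible reading of this hunk (the commit message only states that the AVX2 output mismatched C): the 16-bit adds and subtracts on s0..s7 can wrap once intermediate values approach the int16_t limits, whereas the C reference keeps these sums in wider types, so the replacement redoes the butterflies on the 32-bit u/v registers and only narrows after rounding via the saturating pack. A tiny demonstration of the wrap:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  int16_t s0 = 30000, s4 = 10000;
  int16_t wrapped = (int16_t)(s0 + s4);  /* wraps in practice: -25536 */
  int32_t widened = (int32_t)s0 + s4;    /* 40000, as wider C math keeps it */
  printf("16-bit wrap: %d, 32-bit: %d\n", wrapped, widened);
  return 0;
}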
...
...
@@ -694,57 +673,32 @@ void fadst16_avx2(__m256i *in) {
  v14 = _mm256_sub_epi32(x10, x14);
  v15 = _mm256_sub_epi32(x11, x15);
  u8 = _mm256_add_epi32(u8, dct_rounding);
  u9 = _mm256_add_epi32(u9, dct_rounding);
  u10 = _mm256_add_epi32(u10, dct_rounding);
  u11 = _mm256_add_epi32(u11, dct_rounding);
  u12 = _mm256_add_epi32(u12, dct_rounding);
  u13 = _mm256_add_epi32(u13, dct_rounding);
  u14 = _mm256_add_epi32(u14, dct_rounding);
  u15 = _mm256_add_epi32(u15, dct_rounding);
  u8 = _mm256_srai_epi32(u8, DCT_CONST_BITS);
  u9 = _mm256_srai_epi32(u9, DCT_CONST_BITS);
  u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
  u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
  u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
  u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
  u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
  u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
  v8 = _mm256_add_epi32(v8, dct_rounding);
  v9 = _mm256_add_epi32(v9, dct_rounding);
  v10 = _mm256_add_epi32(v10, dct_rounding);
  v11 = _mm256_add_epi32(v11, dct_rounding);
  v12 = _mm256_add_epi32(v12, dct_rounding);
  v13 = _mm256_add_epi32(v13, dct_rounding);
  v14 = _mm256_add_epi32(v14, dct_rounding);
  v15 = _mm256_add_epi32(v15, dct_rounding);
  v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
  v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
  v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
  v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
  v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
  v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
  v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
  v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
  x8 = _mm256_packs_epi32(u8, v8);
  x9 = _mm256_packs_epi32(u9, v9);
  x10 = _mm256_packs_epi32(u10, v10);
  x11 = _mm256_packs_epi32(u11, v11);
  x12 = _mm256_packs_epi32(u12, v12);
  x13 = _mm256_packs_epi32(u13, v13);
  x14 = _mm256_packs_epi32(u14, v14);
  x15 = _mm256_packs_epi32(u15, v15);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  y0 = _mm256_unpacklo_epi16(x4, x5);
  y1 = _mm256_unpackhi_epi16(x4, x5);
  s4 = _mm256_madd_epi16(y0, cospi_p08_p24);
...
...
@@ -759,11 +713,6 @@ void fadst16_avx2(__m256i *in) {
  s7 = _mm256_madd_epi16(y0, cospi_p08_p24);
  x7 = _mm256_madd_epi16(y1, cospi_p08_p24);
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  y0 = _mm256_unpacklo_epi16(x12, x13);
  y1 = _mm256_unpackhi_epi16(x12, x13);
  s12 = _mm256_madd_epi16(y0, cospi_p08_p24);
...
...
@@ -778,10 +727,37 @@ void fadst16_avx2(__m256i *in) {
  s15 = _mm256_madd_epi16(y0, cospi_p08_p24);
  x15 = _mm256_madd_epi16(y1, cospi_p08_p24);
  in[0] = _mm256_add_epi16(s0, s2);
  x1 = _mm256_add_epi16(s1, s3);
  x2 = _mm256_sub_epi16(s0, s2);
  x3 = _mm256_sub_epi16(s1, s3);
  u0 = _mm256_add_epi32(x0, x2);
  v0 = _mm256_add_epi32(s0, s2);
  u1 = _mm256_add_epi32(x1, x3);
  v1 = _mm256_add_epi32(s1, s3);
  u2 = _mm256_sub_epi32(x0, x2);
  v2 = _mm256_sub_epi32(s0, s2);
  u3 = _mm256_sub_epi32(x1, x3);
  v3 = _mm256_sub_epi32(s1, s3);
  u0 = _mm256_add_epi32(u0, dct_rounding);
  v0 = _mm256_add_epi32(v0, dct_rounding);
  u1 = _mm256_add_epi32(u1, dct_rounding);
  v1 = _mm256_add_epi32(v1, dct_rounding);
  u2 = _mm256_add_epi32(u2, dct_rounding);
  v2 = _mm256_add_epi32(v2, dct_rounding);
  u3 = _mm256_add_epi32(u3, dct_rounding);
  v3 = _mm256_add_epi32(v3, dct_rounding);
  u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
  v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
  v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
  v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
  v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
  in[0] = _mm256_packs_epi32(u0, v0);
  x1 = _mm256_packs_epi32(u1, v1);
  x2 = _mm256_packs_epi32(u2, v2);
  x3 = _mm256_packs_epi32(u3, v3);

  // Rounding on s4 + s6, s5 + s7, s4 - s6, s5 - s7
  u4 = _mm256_add_epi32(s4, s6);
...
...
@@ -819,10 +795,37 @@ void fadst16_avx2(__m256i *in) {
  x6 = _mm256_packs_epi32(u6, v6);
  x7 = _mm256_packs_epi32(u7, v7);
  x8 = _mm256_add_epi16(s8, s10);
  in[14] = _mm256_add_epi16(s9, s11);
  x10 = _mm256_sub_epi16(s8, s10);
  x11 = _mm256_sub_epi16(s9, s11);
  u0 = _mm256_add_epi32(u8, u10);
  v0 = _mm256_add_epi32(v8, v10);
  u1 = _mm256_add_epi32(u9, u11);
  v1 = _mm256_add_epi32(v9, v11);
  u2 = _mm256_sub_epi32(u8, u10);
  v2 = _mm256_sub_epi32(v8, v10);
  u3 = _mm256_sub_epi32(u9, u11);
  v3 = _mm256_sub_epi32(v9, v11);
  u0 = _mm256_add_epi32(u0, dct_rounding);
  v0 = _mm256_add_epi32(v0, dct_rounding);
  u1 = _mm256_add_epi32(u1, dct_rounding);
  v1 = _mm256_add_epi32(v1, dct_rounding);
  u2 = _mm256_add_epi32(u2, dct_rounding);
  v2 = _mm256_add_epi32(v2, dct_rounding);
  u3 = _mm256_add_epi32(u3, dct_rounding);
  v3 = _mm256_add_epi32(v3, dct_rounding);
  u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
  v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
  v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
  v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
  v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
  x8 = _mm256_packs_epi32(u0, v0);
  in[14] = _mm256_packs_epi32(u1, v1);
  x10 = _mm256_packs_epi32(u2, v2);
  x11 = _mm256_packs_epi32(u3, v3);

  // Rounding on s12 + s14, s13 + s15, s12 - s14, s13 - s15
  u12 = _mm256_add_epi32(s12, s14);
...
...