Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
6d90a9d2
Commit
6d90a9d2
authored
Nov 03, 2014
by
Yunqing Wang
Committed by
Gerrit Code Review
Nov 03, 2014
Browse files
Merge "WORKAROUND FIX FOR GCC4.9.1"
parents
6bab322d
86175a57
Changes
2
Hide whitespace changes
Inline
Side-by-side
test/lpf_8_test.cc
View file @
6d90a9d2
...
...
@@ -522,6 +522,13 @@ INSTANTIATE_TEST_CASE_P(
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif
#if HAVE_AVX2 && (!CONFIG_VP9_HIGHBITDEPTH)
INSTANTIATE_TEST_CASE_P
(
AVX2_C_COMPARE_SINGLE
,
Loop8Test6Param
,
::
testing
::
Values
(
make_tuple
(
&
vp9_lpf_horizontal_16_avx2
,
&
vp9_lpf_horizontal_16_c
,
8
)));
#endif
#if HAVE_SSE2
#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P
(
...
...
@@ -584,4 +591,5 @@ INSTANTIATE_TEST_CASE_P(
&
vp9_lpf_vertical_8_dual_c
,
8
)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif
}
// namespace
vp9/common/x86/vp9_loopfilter_intrin_avx2.c
View file @
6d90a9d2
...
...
@@ -9,6 +9,7 @@
*/
#include
<immintrin.h>
/* AVX2 */
#include
"vpx_ports/mem.h"
static
void
mb_lpf_horizontal_edge_w_avx2_8
(
unsigned
char
*
s
,
int
p
,
const
unsigned
char
*
_blimit
,
const
unsigned
char
*
_limit
,
...
...
@@ -392,6 +393,11 @@ static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
}
}
DECLARE_ALIGNED
(
32
,
static
const
uint8_t
,
filt_loopfilter_avx2
[
32
])
=
{
0
,
128
,
1
,
128
,
2
,
128
,
3
,
128
,
4
,
128
,
5
,
128
,
6
,
128
,
7
,
128
,
8
,
128
,
9
,
128
,
10
,
128
,
11
,
128
,
12
,
128
,
13
,
128
,
14
,
128
,
15
,
128
};
static
void
mb_lpf_horizontal_edge_w_avx2_16
(
unsigned
char
*
s
,
int
p
,
const
unsigned
char
*
_blimit
,
const
unsigned
char
*
_limit
,
const
unsigned
char
*
_thresh
)
{
...
...
@@ -401,6 +407,9 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
__m128i
p7
,
p6
,
p5
;
__m128i
p4
,
p3
,
p2
,
p1
,
p0
,
q0
,
q1
,
q2
,
q3
,
q4
;
__m128i
q5
,
q6
,
q7
;
__m256i
p256_7
,
q256_7
,
p256_6
,
q256_6
,
p256_5
,
q256_5
,
p256_4
,
q256_4
,
p256_3
,
q256_3
,
p256_2
,
q256_2
,
p256_1
,
q256_1
,
p256_0
,
q256_0
;
const
__m128i
thresh
=
_mm_broadcastb_epi8
(
_mm_cvtsi32_si128
((
int
)
_thresh
[
0
]));
...
...
@@ -409,16 +418,37 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
const
__m128i
blimit
=
_mm_broadcastb_epi8
(
_mm_cvtsi32_si128
((
int
)
_blimit
[
0
]));
p4
=
_mm_loadu_si128
((
__m128i
*
)
(
s
-
5
*
p
));
p3
=
_mm_loadu_si128
((
__m128i
*
)
(
s
-
4
*
p
));
p2
=
_mm_loadu_si128
((
__m128i
*
)
(
s
-
3
*
p
));
p1
=
_mm_loadu_si128
((
__m128i
*
)
(
s
-
2
*
p
));
p0
=
_mm_loadu_si128
((
__m128i
*
)
(
s
-
1
*
p
));
q0
=
_mm_loadu_si128
((
__m128i
*
)
(
s
-
0
*
p
));
q1
=
_mm_loadu_si128
((
__m128i
*
)
(
s
+
1
*
p
));
q2
=
_mm_loadu_si128
((
__m128i
*
)
(
s
+
2
*
p
));
q3
=
_mm_loadu_si128
((
__m128i
*
)
(
s
+
3
*
p
));
q4
=
_mm_loadu_si128
((
__m128i
*
)
(
s
+
4
*
p
));
p256_4
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
-
5
*
p
)));
p256_3
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
-
4
*
p
)));
p256_2
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
-
3
*
p
)));
p256_1
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
-
2
*
p
)));
p256_0
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
-
1
*
p
)));
q256_0
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
-
0
*
p
)));
q256_1
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
+
1
*
p
)));
q256_2
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
+
2
*
p
)));
q256_3
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
+
3
*
p
)));
q256_4
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
+
4
*
p
)));
p4
=
_mm256_castsi256_si128
(
p256_4
);
p3
=
_mm256_castsi256_si128
(
p256_3
);
p2
=
_mm256_castsi256_si128
(
p256_2
);
p1
=
_mm256_castsi256_si128
(
p256_1
);
p0
=
_mm256_castsi256_si128
(
p256_0
);
q0
=
_mm256_castsi256_si128
(
q256_0
);
q1
=
_mm256_castsi256_si128
(
q256_1
);
q2
=
_mm256_castsi256_si128
(
q256_2
);
q3
=
_mm256_castsi256_si128
(
q256_3
);
q4
=
_mm256_castsi256_si128
(
q256_4
);
{
const
__m128i
abs_p1p0
=
_mm_or_si128
(
_mm_subs_epu8
(
p1
,
p0
),
...
...
@@ -534,23 +564,35 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
flat
=
_mm_cmpeq_epi8
(
flat
,
zero
);
flat
=
_mm_and_si128
(
flat
,
mask
);
p5
=
_mm_loadu_si128
((
__m128i
*
)
(
s
-
6
*
p
));
q5
=
_mm_loadu_si128
((
__m128i
*
)
(
s
+
5
*
p
));
p256_5
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
-
6
*
p
)));
q256_5
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
+
5
*
p
)));
p5
=
_mm256_castsi256_si128
(
p256_5
);
q5
=
_mm256_castsi256_si128
(
q256_5
);
flat2
=
_mm_max_epu8
(
_mm_or_si128
(
_mm_subs_epu8
(
p5
,
p0
),
_mm_subs_epu8
(
p0
,
p5
)),
_mm_or_si128
(
_mm_subs_epu8
(
q5
,
q0
),
_mm_subs_epu8
(
q0
,
q5
)));
flat2
=
_mm_max_epu8
(
work
,
flat2
);
p6
=
_mm_loadu_si128
((
__m128i
*
)
(
s
-
7
*
p
));
q6
=
_mm_loadu_si128
((
__m128i
*
)
(
s
+
6
*
p
));
p256_6
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
-
7
*
p
)));
q256_6
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
+
6
*
p
)));
p6
=
_mm256_castsi256_si128
(
p256_6
);
q6
=
_mm256_castsi256_si128
(
q256_6
);
work
=
_mm_max_epu8
(
_mm_or_si128
(
_mm_subs_epu8
(
p6
,
p0
),
_mm_subs_epu8
(
p0
,
p6
)),
_mm_or_si128
(
_mm_subs_epu8
(
q6
,
q0
),
_mm_subs_epu8
(
q0
,
q6
)));
flat2
=
_mm_max_epu8
(
work
,
flat2
);
p7
=
_mm_loadu_si128
((
__m128i
*
)
(
s
-
8
*
p
));
q7
=
_mm_loadu_si128
((
__m128i
*
)
(
s
+
7
*
p
));
p256_7
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
-
8
*
p
)));
q256_7
=
_mm256_castpd_si256
(
_mm256_broadcast_pd
(
(
__m128d
const
*
)(
s
+
7
*
p
)));
p7
=
_mm256_castsi256_si128
(
p256_7
);
q7
=
_mm256_castsi256_si128
(
q256_7
);
work
=
_mm_max_epu8
(
_mm_or_si128
(
_mm_subs_epu8
(
p7
,
p0
),
_mm_subs_epu8
(
p0
,
p7
)),
_mm_or_si128
(
_mm_subs_epu8
(
q7
,
q0
),
_mm_subs_epu8
(
q0
,
q7
)));
...
...
@@ -566,29 +608,28 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
{
const
__m256i
eight
=
_mm256_set1_epi16
(
8
);
const
__m256i
four
=
_mm256_set1_epi16
(
4
);
__m256i
p256_7
,
q256_7
,
p256_6
,
q256_6
,
p256_5
,
q256_5
,
p256_4
,
q256_4
,
p256_3
,
q256_3
,
p256_2
,
q256_2
,
p256_1
,
q256_1
,
p256_0
,
q256_0
;
__m256i
pixelFilter_p
,
pixelFilter_q
,
pixetFilter_p2p1p0
,
pixetFilter_q2q1q0
,
sum_p7
,
sum_q7
,
sum_p3
,
sum_q3
,
res_p
,
res_q
;
p256_7
=
_mm256_cvtepu8_epi16
(
p7
);
p256_6
=
_mm256_cvtepu8_epi16
(
p6
);
p256_5
=
_mm256_cvtepu8_epi16
(
p5
);
p256_4
=
_mm256_cvtepu8_epi16
(
p4
);
p256_3
=
_mm256_cvtepu8_epi16
(
p3
);
p256_2
=
_mm256_cvtepu8_epi16
(
p2
);
p256_1
=
_mm256_cvtepu8_epi16
(
p1
);
p256_0
=
_mm256_cvtepu8_epi16
(
p0
);
q256_0
=
_mm256_cvtepu8_epi16
(
q0
);
q256_1
=
_mm256_cvtepu8_epi16
(
q1
);
q256_2
=
_mm256_cvtepu8_epi16
(
q2
);
q256_3
=
_mm256_cvtepu8_epi16
(
q3
);
q256_4
=
_mm256_cvtepu8_epi16
(
q4
);
q256_5
=
_mm256_cvtepu8_epi16
(
q5
);
q256_6
=
_mm256_cvtepu8_epi16
(
q6
);
q256_7
=
_mm256_cvtepu8_epi16
(
q7
);
const
__m256i
filter
=
_mm256_load_si256
(
(
__m256i
const
*
)
filt_loopfilter_avx2
);
p256_7
=
_mm256_shuffle_epi8
(
p256_7
,
filter
);
p256_6
=
_mm256_shuffle_epi8
(
p256_6
,
filter
);
p256_5
=
_mm256_shuffle_epi8
(
p256_5
,
filter
);
p256_4
=
_mm256_shuffle_epi8
(
p256_4
,
filter
);
p256_3
=
_mm256_shuffle_epi8
(
p256_3
,
filter
);
p256_2
=
_mm256_shuffle_epi8
(
p256_2
,
filter
);
p256_1
=
_mm256_shuffle_epi8
(
p256_1
,
filter
);
p256_0
=
_mm256_shuffle_epi8
(
p256_0
,
filter
);
q256_0
=
_mm256_shuffle_epi8
(
q256_0
,
filter
);
q256_1
=
_mm256_shuffle_epi8
(
q256_1
,
filter
);
q256_2
=
_mm256_shuffle_epi8
(
q256_2
,
filter
);
q256_3
=
_mm256_shuffle_epi8
(
q256_3
,
filter
);
q256_4
=
_mm256_shuffle_epi8
(
q256_4
,
filter
);
q256_5
=
_mm256_shuffle_epi8
(
q256_5
,
filter
);
q256_6
=
_mm256_shuffle_epi8
(
q256_6
,
filter
);
q256_7
=
_mm256_shuffle_epi8
(
q256_7
,
filter
);
pixelFilter_p
=
_mm256_add_epi16
(
_mm256_add_epi16
(
p256_6
,
p256_5
),
_mm256_add_epi16
(
p256_4
,
p256_3
));
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment