Commit 6f5569f3 authored by Yi Luo

Highbd parallel_deblocking sse2 optimization

- Decoder speed improves ~13.7% (baseline + parallel_deblocking).
- Highbd loopfilter AVX2 version works when this experiment is
  disabled; when it is enabled, the AVX2 entry points forward to the
  SSE2 versions.

Change-Id: I5d56b137a1d52236a4735656c370d57ef71ae043
parent 0af2732e
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "aom_dsp/x86/lpf_common_sse2.h" #include "aom_dsp/x86/lpf_common_sse2.h"
#include "aom/aom_integer.h" #include "aom/aom_integer.h"
#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
const uint8_t *t, int bd, __m256i *blt, const uint8_t *t, int bd, __m256i *blt,
__m256i *lt, __m256i *thr) { __m256i *lt, __m256i *thr) {
...@@ -200,7 +201,54 @@ static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask, ...@@ -200,7 +201,54 @@ static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask,
qs[1] = _mm256_adds_epi16(qs1, t80); qs[1] = _mm256_adds_epi16(qs1, t80);
ps[1] = _mm256_adds_epi16(ps1, t80); ps[1] = _mm256_adds_epi16(ps1, t80);
} }
#endif // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
// AVX2-named entry point for the highbd horizontal edge-16 loop filter as
// built when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 (see the enclosing
// #if). It performs no AVX2 work of its own: all six arguments are passed
// through unchanged to aom_highbd_lpf_horizontal_edge_16_sse2().
// NOTE(review): blt/lt/thr are presumably the blimit/limit/thresh pointers
// and bd the bit depth -- confirm against the SSE2 implementation.
void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p,
const uint8_t *blt,
const uint8_t *lt,
const uint8_t *thr, int bd) {
// Delegate to the SSE2 version with identical arguments.
aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd);
}
// AVX2-named entry point for the highbd vertical 16 dual loop filter as built
// when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 (see the enclosing #if).
// It is a pure forwarder: all six arguments are passed through unchanged to
// aom_highbd_lpf_vertical_16_dual_sse2().
// NOTE(review): blt/lt/thr are presumably the blimit/limit/thresh pointers
// and bd the bit depth -- confirm against the SSE2 implementation.
void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
const uint8_t *blt, const uint8_t *lt,
const uint8_t *thr, int bd) {
// Delegate to the SSE2 version with identical arguments.
aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd);
}
// AVX2-named entry point for the highbd horizontal 4 dual loop filter as built
// when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 (see the enclosing #if).
// It is a pure forwarder: both parameter sets (blimit0/limit0/thresh0 and
// blimit1/limit1/thresh1), together with s, p and bd, are passed through
// unchanged and in the same order to aom_highbd_lpf_horizontal_4_dual_sse2().
void aom_highbd_lpf_horizontal_4_dual_avx2(
uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
// Delegate to the SSE2 version with identical arguments.
aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
limit1, thresh1, bd);
}
// AVX2-named entry point for the highbd horizontal 8 dual loop filter as built
// when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 (see the enclosing #if).
// It is a pure forwarder: both parameter sets (blimit0/limit0/thresh0 and
// blimit1/limit1/thresh1), together with s, p and bd, are passed through
// unchanged and in the same order to aom_highbd_lpf_horizontal_8_dual_sse2().
void aom_highbd_lpf_horizontal_8_dual_avx2(
uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
// Delegate to the SSE2 version with identical arguments.
aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
limit1, thresh1, bd);
}
// AVX2-named entry point for the highbd vertical 4 dual loop filter as built
// when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 (see the enclosing #if).
// It is a pure forwarder: both parameter sets (blimit0/limit0/thresh0 and
// blimit1/limit1/thresh1), together with s, p and bd, are passed through
// unchanged and in the same order to aom_highbd_lpf_vertical_4_dual_sse2().
void aom_highbd_lpf_vertical_4_dual_avx2(
uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
// Delegate to the SSE2 version with identical arguments.
aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
limit1, thresh1, bd);
}
// AVX2-named entry point for the highbd vertical 8 dual loop filter as built
// when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 (see the enclosing #if).
// It is a pure forwarder: both parameter sets (blimit0/limit0/thresh0 and
// blimit1/limit1/thresh1), together with s, p and bd, are passed through
// unchanged and in the same order to aom_highbd_lpf_vertical_8_dual_sse2().
void aom_highbd_lpf_vertical_8_dual_avx2(
uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
// Delegate to the SSE2 version with identical arguments.
aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
limit1, thresh1, bd);
}
#else
void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch, void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch,
const uint8_t *blt, const uint8_t *blt,
const uint8_t *lt, const uint8_t *lt,
...@@ -822,3 +870,4 @@ void aom_highbd_lpf_vertical_8_dual_avx2( ...@@ -822,3 +870,4 @@ void aom_highbd_lpf_vertical_8_dual_avx2(
// Transpose back // Transpose back
highbd_transpose(src, 16, dst, p, 2); highbd_transpose(src, 16, dst, p, 2);
} }
#endif // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
This diff is collapsed.
This diff is collapsed.
...@@ -499,7 +499,7 @@ TEST_P(Loop8Test9Param, DISABLED_Speed) { ...@@ -499,7 +499,7 @@ TEST_P(Loop8Test9Param, DISABLED_Speed) {
using std::tr1::make_tuple; using std::tr1::make_tuple;
#if HAVE_SSE2 && (!CONFIG_PARALLEL_DEBLOCKING) #if HAVE_SSE2
#if CONFIG_HIGHBITDEPTH #if CONFIG_HIGHBITDEPTH
const loop8_param_t kHbdLoop8Test6[] = { const loop8_param_t kHbdLoop8Test6[] = {
...@@ -550,6 +550,7 @@ const loop8_param_t kHbdLoop8Test6[] = { ...@@ -550,6 +550,7 @@ const loop8_param_t kHbdLoop8Test6[] = {
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param, INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param,
::testing::ValuesIn(kHbdLoop8Test6)); ::testing::ValuesIn(kHbdLoop8Test6));
#else #else
#if !CONFIG_PARALLEL_DEBLOCKING
const loop8_param_t kLoop8Test6[] = { const loop8_param_t kLoop8Test6[] = {
make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8), make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8),
make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8), make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8),
...@@ -564,10 +565,11 @@ const loop8_param_t kLoop8Test6[] = { ...@@ -564,10 +565,11 @@ const loop8_param_t kLoop8Test6[] = {
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param, INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param,
::testing::ValuesIn(kLoop8Test6)); ::testing::ValuesIn(kLoop8Test6));
#endif // CONFIG_HIGHBITDEPTH
#endif #endif
#endif // CONFIG_HIGHBITDEPTH
#endif // HAVE_SSE2
#if HAVE_AVX2 && (!CONFIG_PARALLEL_DEBLOCKING) #if HAVE_AVX2
#if CONFIG_HIGHBITDEPTH #if CONFIG_HIGHBITDEPTH
const loop8_param_t kHbdLoop8Test6Avx2[] = { const loop8_param_t kHbdLoop8Test6Avx2[] = {
...@@ -600,7 +602,7 @@ INSTANTIATE_TEST_CASE_P( ...@@ -600,7 +602,7 @@ INSTANTIATE_TEST_CASE_P(
&aom_lpf_horizontal_edge_16_c, 8))); &aom_lpf_horizontal_edge_16_c, 8)));
#endif #endif
#if HAVE_SSE2 && (!CONFIG_PARALLEL_DEBLOCKING) #if HAVE_SSE2
#if CONFIG_HIGHBITDEPTH #if CONFIG_HIGHBITDEPTH
const dualloop8_param_t kHbdLoop8Test9[] = { const dualloop8_param_t kHbdLoop8Test9[] = {
make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2, make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
...@@ -632,6 +634,7 @@ const dualloop8_param_t kHbdLoop8Test9[] = { ...@@ -632,6 +634,7 @@ const dualloop8_param_t kHbdLoop8Test9[] = {
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param, INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param,
::testing::ValuesIn(kHbdLoop8Test9)); ::testing::ValuesIn(kHbdLoop8Test9));
#else #else
#if !CONFIG_PARALLEL_DEBLOCKING
const dualloop8_param_t kLoop8Test9[] = { const dualloop8_param_t kLoop8Test9[] = {
make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8), make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8),
make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8), make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8),
...@@ -641,10 +644,11 @@ const dualloop8_param_t kLoop8Test9[] = { ...@@ -641,10 +644,11 @@ const dualloop8_param_t kLoop8Test9[] = {
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param, INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param,
::testing::ValuesIn(kLoop8Test9)); ::testing::ValuesIn(kLoop8Test9));
#endif // CONFIG_HIGHBITDEPTH
#endif #endif
#endif // CONFIG_HIGHBITDEPTH
#endif // HAVE_SSE2
#if HAVE_AVX2 && (!CONFIG_PARALLEL_DEBLOCKING) #if HAVE_AVX2
#if CONFIG_HIGHBITDEPTH #if CONFIG_HIGHBITDEPTH
const dualloop8_param_t kHbdLoop8Test9Avx2[] = { const dualloop8_param_t kHbdLoop8Test9Avx2[] = {
make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2, make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment