Commit 6f5569f3 authored by Yi Luo

Highbd parallel_deblocking sse2 optimization

- Decoder speed improves ~13.7% (baseline + parallel_deblocking).
- The highbd loopfilter AVX2 version is used only when this experiment is
  disabled; when it is enabled, the AVX2 entry points forward to the SSE2
  versions.

Change-Id: I5d56b137a1d52236a4735656c370d57ef71ae043
parent 0af2732e
......@@ -16,6 +16,7 @@
#include "aom_dsp/x86/lpf_common_sse2.h"
#include "aom/aom_integer.h"
#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
const uint8_t *t, int bd, __m256i *blt,
__m256i *lt, __m256i *thr) {
......@@ -200,7 +201,54 @@ static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask,
qs[1] = _mm256_adds_epi16(qs1, t80);
ps[1] = _mm256_adds_epi16(ps1, t80);
}
#endif // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4
#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
// AVX2-named entry point for the highbd horizontal edge-16 loop filter, as
// compiled when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 is set.
// It performs no AVX2 work itself: every argument is forwarded unchanged to
// aom_highbd_lpf_horizontal_edge_16_sse2().
// NOTE(review): presumably s is the sample buffer, p its pitch, blt/lt/thr the
// blimit/limit/thresh pointers and bd the bit depth -- confirm against the
// SSE2 implementation.
void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p,
const uint8_t *blt,
const uint8_t *lt,
const uint8_t *thr, int bd) {
// Delegate to the SSE2 version of the same filter.
aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd);
}
// AVX2-named entry point for the highbd vertical 16 "dual" loop filter, as
// compiled when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 is set.
// It performs no AVX2 work itself: every argument is forwarded unchanged to
// aom_highbd_lpf_vertical_16_dual_sse2().
// NOTE(review): presumably s is the sample buffer, p its pitch, blt/lt/thr the
// blimit/limit/thresh pointers and bd the bit depth -- confirm against the
// SSE2 implementation.
void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p,
const uint8_t *blt, const uint8_t *lt,
const uint8_t *thr, int bd) {
// Delegate to the SSE2 version of the same filter.
aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd);
}
// AVX2-named entry point for the highbd horizontal 4 "dual" loop filter, as
// compiled when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 is set.
// It performs no AVX2 work itself: both parameter sets (blimit0/limit0/thresh0
// and blimit1/limit1/thresh1) are forwarded unchanged, together with s, p and
// bd, to aom_highbd_lpf_horizontal_4_dual_sse2().
// NOTE(review): presumably s is the sample buffer, p its pitch and bd the bit
// depth -- confirm against the SSE2 implementation.
void aom_highbd_lpf_horizontal_4_dual_avx2(
uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
// Delegate to the SSE2 version of the same filter.
aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
limit1, thresh1, bd);
}
// AVX2-named entry point for the highbd horizontal 8 "dual" loop filter, as
// compiled when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 is set.
// It performs no AVX2 work itself: both parameter sets (blimit0/limit0/thresh0
// and blimit1/limit1/thresh1) are forwarded unchanged, together with s, p and
// bd, to aom_highbd_lpf_horizontal_8_dual_sse2().
// NOTE(review): presumably s is the sample buffer, p its pitch and bd the bit
// depth -- confirm against the SSE2 implementation.
void aom_highbd_lpf_horizontal_8_dual_avx2(
uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
// Delegate to the SSE2 version of the same filter.
aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
limit1, thresh1, bd);
}
// AVX2-named entry point for the highbd vertical 4 "dual" loop filter, as
// compiled when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 is set.
// It performs no AVX2 work itself: both parameter sets (blimit0/limit0/thresh0
// and blimit1/limit1/thresh1) are forwarded unchanged, together with s, p and
// bd, to aom_highbd_lpf_vertical_4_dual_sse2().
// NOTE(review): presumably s is the sample buffer, p its pitch and bd the bit
// depth -- confirm against the SSE2 implementation.
void aom_highbd_lpf_vertical_4_dual_avx2(
uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
// Delegate to the SSE2 version of the same filter.
aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
limit1, thresh1, bd);
}
// AVX2-named entry point for the highbd vertical 8 "dual" loop filter, as
// compiled when CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 is set.
// It performs no AVX2 work itself: both parameter sets (blimit0/limit0/thresh0
// and blimit1/limit1/thresh1) are forwarded unchanged, together with s, p and
// bd, to aom_highbd_lpf_vertical_8_dual_sse2().
// NOTE(review): presumably s is the sample buffer, p its pitch and bd the bit
// depth -- confirm against the SSE2 implementation.
void aom_highbd_lpf_vertical_8_dual_avx2(
uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
// Delegate to the SSE2 version of the same filter.
aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
limit1, thresh1, bd);
}
#else
void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch,
const uint8_t *blt,
const uint8_t *lt,
......@@ -822,3 +870,4 @@ void aom_highbd_lpf_vertical_8_dual_avx2(
// Transpose back
highbd_transpose(src, 16, dst, p, 2);
}
#endif // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
This diff is collapsed.
......@@ -2997,7 +2997,7 @@ static void av1_filter_block_plane_vert(
case 4:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(filt_start),
aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(filt_start),
line_length, params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
......@@ -3009,7 +3009,7 @@ static void av1_filter_block_plane_vert(
case 8:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_vertical_8_c(CONVERT_TO_SHORTPTR(filt_start),
aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(filt_start),
line_length, params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
......@@ -3021,9 +3021,9 @@ static void av1_filter_block_plane_vert(
case 16:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_vertical_16_c(
CONVERT_TO_SHORTPTR(filt_start), line_length, params.mblim,
params.lim, params.hev_thr, cm->bit_depth);
aom_highbd_lpf_vertical_16(CONVERT_TO_SHORTPTR(filt_start),
line_length, params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
#endif // CONFIG_HIGHBITDEPTH
aom_lpf_vertical_16_c(filt_start, line_length, params.mblim,
......@@ -3055,7 +3055,7 @@ static void av1_filter_block_plane_vert(
uint8_t *const filt_start = block + pivot;
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(filt_start),
aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(filt_start),
line_length, params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
......@@ -3067,15 +3067,15 @@ static void av1_filter_block_plane_vert(
if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
}
}
#else // CONFIG_LPF_DIRECT
#else // !CONFIG_LPF_DIRECT
switch (params.filter_length) {
// apply 4-tap filtering
case 4:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
cm->bit_depth);
else
#endif // CONFIG_HIGHBITDEPTH
aom_lpf_vertical_4_c(p, dst_stride, params.mblim, params.lim,
......@@ -3085,9 +3085,9 @@ static void av1_filter_block_plane_vert(
case 8:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_vertical_8_c(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
cm->bit_depth);
else
#endif // CONFIG_HIGHBITDEPTH
aom_lpf_vertical_8_c(p, dst_stride, params.mblim, params.lim,
......@@ -3097,9 +3097,9 @@ static void av1_filter_block_plane_vert(
case 16:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_vertical_16_c(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
aom_highbd_lpf_vertical_16(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
cm->bit_depth);
else
#endif // CONFIG_HIGHBITDEPTH
aom_lpf_vertical_16_c(p, dst_stride, params.mblim, params.lim,
......@@ -3112,7 +3112,7 @@ static void av1_filter_block_plane_vert(
if (params.filter_length_internal) {
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(p + 4), dst_stride,
aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p + 4), dst_stride,
params.mblim, params.lim, params.hev_thr,
cm->bit_depth);
else
......@@ -3183,9 +3183,9 @@ static void av1_filter_block_plane_horz(
case 4:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_horizontal_4_c(
CONVERT_TO_SHORTPTR(filt_start), line_length, params.mblim,
params.lim, params.hev_thr, cm->bit_depth);
aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(filt_start),
line_length, params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
#endif // CONFIG_HIGHBITDEPTH
aom_lpf_horizontal_4_c(filt_start, line_length, params.mblim,
......@@ -3195,9 +3195,9 @@ static void av1_filter_block_plane_horz(
case 8:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_horizontal_8_c(
CONVERT_TO_SHORTPTR(filt_start), line_length, params.mblim,
params.lim, params.hev_thr, cm->bit_depth);
aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(filt_start),
line_length, params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
#endif // CONFIG_HIGHBITDEPTH
aom_lpf_horizontal_8_c(filt_start, line_length, params.mblim,
......@@ -3207,7 +3207,7 @@ static void av1_filter_block_plane_horz(
case 16:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_horizontal_edge_16_c(
aom_highbd_lpf_horizontal_edge_16(
CONVERT_TO_SHORTPTR(filt_start), line_length, params.mblim,
params.lim, params.hev_thr, cm->bit_depth);
else
......@@ -3241,7 +3241,7 @@ static void av1_filter_block_plane_horz(
uint8_t *const filt_start = block + pivot * line_length;
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(filt_start),
aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(filt_start),
line_length, params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
......@@ -3253,13 +3253,13 @@ static void av1_filter_block_plane_horz(
if (orig_pos[i] >= 0) src[orig_pos[i]] = block[i];
}
}
#else // CONFIG_LPF_DIRECT
#else // !CONFIG_LPF_DIRECT
switch (params.filter_length) {
// apply 4-tap filtering
case 4:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(p), dst_stride,
aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
......@@ -3271,7 +3271,7 @@ static void av1_filter_block_plane_horz(
case 8:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_horizontal_8_c(CONVERT_TO_SHORTPTR(p), dst_stride,
aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
......@@ -3283,7 +3283,7 @@ static void av1_filter_block_plane_horz(
case 16:
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_horizontal_edge_16_c(
aom_highbd_lpf_horizontal_edge_16(
CONVERT_TO_SHORTPTR(p), dst_stride, params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
......@@ -3298,7 +3298,7 @@ static void av1_filter_block_plane_horz(
if (params.filter_length_internal) {
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(p + 4 * dst_stride),
aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p + 4 * dst_stride),
dst_stride, params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
else
......
......@@ -499,7 +499,7 @@ TEST_P(Loop8Test9Param, DISABLED_Speed) {
using std::tr1::make_tuple;
#if HAVE_SSE2 && (!CONFIG_PARALLEL_DEBLOCKING)
#if HAVE_SSE2
#if CONFIG_HIGHBITDEPTH
const loop8_param_t kHbdLoop8Test6[] = {
......@@ -550,6 +550,7 @@ const loop8_param_t kHbdLoop8Test6[] = {
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param,
::testing::ValuesIn(kHbdLoop8Test6));
#else
#if !CONFIG_PARALLEL_DEBLOCKING
const loop8_param_t kLoop8Test6[] = {
make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8),
make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8),
......@@ -564,10 +565,11 @@ const loop8_param_t kLoop8Test6[] = {
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param,
::testing::ValuesIn(kLoop8Test6));
#endif // CONFIG_HIGHBITDEPTH
#endif
#endif // CONFIG_HIGHBITDEPTH
#endif // HAVE_SSE2
#if HAVE_AVX2 && (!CONFIG_PARALLEL_DEBLOCKING)
#if HAVE_AVX2
#if CONFIG_HIGHBITDEPTH
const loop8_param_t kHbdLoop8Test6Avx2[] = {
......@@ -600,7 +602,7 @@ INSTANTIATE_TEST_CASE_P(
&aom_lpf_horizontal_edge_16_c, 8)));
#endif
#if HAVE_SSE2 && (!CONFIG_PARALLEL_DEBLOCKING)
#if HAVE_SSE2
#if CONFIG_HIGHBITDEPTH
const dualloop8_param_t kHbdLoop8Test9[] = {
make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
......@@ -632,6 +634,7 @@ const dualloop8_param_t kHbdLoop8Test9[] = {
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param,
::testing::ValuesIn(kHbdLoop8Test9));
#else
#if !CONFIG_PARALLEL_DEBLOCKING
const dualloop8_param_t kLoop8Test9[] = {
make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8),
make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8),
......@@ -641,10 +644,11 @@ const dualloop8_param_t kLoop8Test9[] = {
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param,
::testing::ValuesIn(kLoop8Test9));
#endif // CONFIG_HIGHBITDEPTH
#endif
#endif // CONFIG_HIGHBITDEPTH
#endif // HAVE_SSE2
#if HAVE_AVX2 && (!CONFIG_PARALLEL_DEBLOCKING)
#if HAVE_AVX2
#if CONFIG_HIGHBITDEPTH
const dualloop8_param_t kHbdLoop8Test9Avx2[] = {
make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment