Commit e33f5819 authored by Maxym Dmytrychenko

SSE2 optimizations for _16 highbd lpf functions

Includes vertical and horizontal implementations,
and fixes 13-tap/parallel deblocking support

Appropriate tests are enabled

Performance changes, SSE2 over C:
Horizontal methods: up to    2x
Vertical   methods: up to  1.5x

Change-Id: Icbdc217a55353eb33417b81847b73005e043262d
parent a07a3b28
......@@ -130,6 +130,16 @@ static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p,
flat_mask_internal(th, p, q, bd, 1, 4, flat);
}
#if CONFIG_DEBLOCK_13TAP
// Flat-mask helper used when CONFIG_DEBLOCK_13TAP is enabled. It only
// forwards its arguments to flat_mask_internal() together with the index
// pair (4, 7); the resulting mask is written through |flat|.
// Note:
// access p[6-4], p[0], and q[6-4], q[0]
static INLINE void highbd_flat_mask4_13(const __m128i *th, const __m128i *p,
                                        const __m128i *q, __m128i *flat,
                                        int bd) {
  // NOTE(review): presumably the half-open index range [4, 7) of taps that
  // flat_mask_internal() compares against p[0]/q[0] -- confirm against its
  // definition, which is not visible here.
  const int idx_from = 4;
  const int idx_to = 7;
  flat_mask_internal(th, p, q, bd, idx_from, idx_to, flat);
}
#endif
// Note:
// access p[7-4], p[0], and q[7-4], q[0]
static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p,
......@@ -214,8 +224,13 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
__m128i blimit, limit, thresh;
get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh);
#if CONFIG_DEBLOCK_13TAP
__m128i p[7], q[7];
load_highbd_pixel(s, 7, pitch, p, q);
#else
__m128i p[8], q[8];
load_highbd_pixel(s, 8, pitch, p, q);
#endif
__m128i mask;
highbd_filter_mask(p, q, &limit, &blimit, &mask);
......@@ -223,7 +238,12 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
__m128i flat, flat2;
const __m128i one = _mm_set1_epi16(1);
highbd_flat_mask4(&one, p, q, &flat, bd);
#if CONFIG_DEBLOCK_13TAP
highbd_flat_mask4_13(&one, p, q, &flat2, bd);
#else
highbd_flat_mask5(&one, p, q, &flat2, bd);
#endif
flat = _mm_and_si128(flat, mask);
flat2 = _mm_and_si128(flat2, flat);
......@@ -233,11 +253,133 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
// flat and wide flat calculations
__m128i flat_p[3], flat_q[3];
#if CONFIG_DEBLOCK_13TAP
__m128i flat2_p[6], flat2_q[6];
#else
__m128i flat2_p[7], flat2_q[7];
#endif
{
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
#if CONFIG_DEBLOCK_13TAP
__m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
__m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
__m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
sum_p = _mm_add_epi16(sum_p, sum_lp);
__m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
sum_q = _mm_add_epi16(sum_q, sum_lq);
sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
flat2_p[0] = _mm_srli_epi16(
_mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
_mm_add_epi16(p[1], q[0]))),
4);
flat2_q[0] = _mm_srli_epi16(
_mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
_mm_add_epi16(p[0], q[1]))),
4);
flat_p[0] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
flat_q[0] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
__m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
__m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
__m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
__m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
sum_q = _mm_sub_epi16(sum_p, p[5]);
sum_p = _mm_sub_epi16(sum_p, q[5]);
flat2_p[1] = _mm_srli_epi16(
_mm_add_epi16(
sum_p, _mm_add_epi16(
sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
4);
flat2_q[1] = _mm_srli_epi16(
_mm_add_epi16(
sum_q, _mm_add_epi16(
sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
4);
sum_lq = _mm_sub_epi16(sum_lp, p[2]);
sum_lp = _mm_sub_epi16(sum_lp, q[2]);
flat_p[1] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
flat_q[1] =
_mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
sum_p6 = _mm_add_epi16(sum_p6, p[6]);
sum_q6 = _mm_add_epi16(sum_q6, q[6]);
sum_p3 = _mm_add_epi16(sum_p3, p[3]);
sum_q3 = _mm_add_epi16(sum_q3, q[3]);
sum_p = _mm_sub_epi16(sum_p, q[4]);
sum_q = _mm_sub_epi16(sum_q, p[4]);
flat2_p[2] = _mm_srli_epi16(
_mm_add_epi16(
sum_p, _mm_add_epi16(
sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
4);
flat2_q[2] = _mm_srli_epi16(
_mm_add_epi16(
sum_q, _mm_add_epi16(
sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
4);
sum_lp = _mm_sub_epi16(sum_lp, q[1]);
sum_lq = _mm_sub_epi16(sum_lq, p[1]);
flat_p[2] =
_mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
flat_q[2] =
_mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
sum_p6 = _mm_add_epi16(sum_p6, p[6]);
sum_q6 = _mm_add_epi16(sum_q6, q[6]);
sum_p = _mm_sub_epi16(sum_p, q[3]);
sum_q = _mm_sub_epi16(sum_q, p[3]);
flat2_p[3] = _mm_srli_epi16(
_mm_add_epi16(
sum_p, _mm_add_epi16(
sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
4);
flat2_q[3] = _mm_srli_epi16(
_mm_add_epi16(
sum_q, _mm_add_epi16(
sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
4);
sum_p6 = _mm_add_epi16(sum_p6, p[6]);
sum_q6 = _mm_add_epi16(sum_q6, q[6]);
sum_p = _mm_sub_epi16(sum_p, q[2]);
sum_q = _mm_sub_epi16(sum_q, p[2]);
flat2_p[4] = _mm_srli_epi16(
_mm_add_epi16(
sum_p, _mm_add_epi16(
sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
4);
flat2_q[4] = _mm_srli_epi16(
_mm_add_epi16(
sum_q, _mm_add_epi16(
sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
4);
sum_p6 = _mm_add_epi16(sum_p6, p[6]);
sum_q6 = _mm_add_epi16(sum_q6, q[6]);
sum_p = _mm_sub_epi16(sum_p, q[1]);
sum_q = _mm_sub_epi16(sum_q, p[1]);
flat2_p[5] = _mm_srli_epi16(
_mm_add_epi16(
sum_p, _mm_add_epi16(
sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
4);
flat2_q[5] = _mm_srli_epi16(
_mm_add_epi16(
sum_q, _mm_add_epi16(
sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
4);
#else
__m128i sum_p =
_mm_add_epi16(_mm_add_epi16(p[6], p[5]), _mm_add_epi16(p[4], p[3]));
__m128i sum_q =
......@@ -309,6 +451,7 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
flat2_q[i] =
_mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[i])), 4);
}
#endif
}
// highbd_filter8
......@@ -331,33 +474,25 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch,
q[i] = _mm_or_si128(qs[i], flat_q[i]);
}
// highbd_filter16
if (pixel_output == FOUR_PIXELS) {
for (i = 6; i >= 0; i--) {
// p[i] remains unchanged if !(flat2 && flat && mask)
p[i] = _mm_andnot_si128(flat2, p[i]);
flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
// get values for when (flat2 && flat && mask)
p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
q[i] = _mm_andnot_si128(flat2, q[i]);
flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
q[i] = _mm_or_si128(q[i], flat2_q[i]);
// highbd_filter16
#if CONFIG_DEBLOCK_13TAP
for (i = 5; i >= 0; i--) {
#else
for (i = 6; i >= 0; i--) {
#endif
// p[i] remains unchanged if !(flat2 && flat && mask)
p[i] = _mm_andnot_si128(flat2, p[i]);
flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
// get values for when (flat2 && flat && mask)
p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
q[i] = _mm_andnot_si128(flat2, q[i]);
flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
q[i] = _mm_or_si128(q[i], flat2_q[i]);
if (pixel_output == FOUR_PIXELS) {
_mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]);
_mm_storel_epi64((__m128i *)(s + i * pitch), q[i]);
}
} else { // EIGHT_PIXELS
for (i = 6; i >= 0; i--) {
// p[i] remains unchanged if !(flat2 && flat && mask)
p[i] = _mm_andnot_si128(flat2, p[i]);
flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
// get values for when (flat2 && flat && mask)
p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
q[i] = _mm_andnot_si128(flat2, q[i]);
flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
q[i] = _mm_or_si128(q[i], flat2_q[i]);
} else {
_mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
_mm_store_si128((__m128i *)(s + i * pitch), q[i]);
}
......
......@@ -2226,10 +2226,9 @@ static void av1_filter_block_plane_vert(
case 16:
if (cm->use_highbitdepth)
#if CONFIG_DEBLOCK_13TAP
// TODO(olah): Remove _c once SIMD for 13-tap is available
aom_highbd_lpf_vertical_16_c(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
aom_highbd_lpf_vertical_16(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
cm->bit_depth);
#else
aom_highbd_lpf_vertical_16(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim, params.hev_thr,
......@@ -2318,10 +2317,9 @@ static void av1_filter_block_plane_horz(
case 16:
if (cm->use_highbitdepth)
#if CONFIG_DEBLOCK_13TAP
// TODO(olah): Remove _c once SIMD for 13-tap is available
aom_highbd_lpf_horizontal_16_c(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
aom_highbd_lpf_horizontal_16(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
params.hev_thr, cm->bit_depth);
#else
aom_highbd_lpf_horizontal_16(CONVERT_TO_SHORTPTR(p), dst_stride,
params.mblim, params.lim,
......
......@@ -453,52 +453,36 @@ const hbdloop_param_t kHbdLoop8Test6[] = {
#endif
make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
8),
#if !CONFIG_DEBLOCK_13TAP
// Despite the name, the following function does 15-tap filtering,
// which is changed to 13-tap and not yet implemented in SIMD
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 8),
#endif
#if !CONFIG_DEBLOCK_13TAP // No SIMD implementation for deblock_13tap yet
make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
&aom_highbd_lpf_horizontal_16_dual_c, 8),
#endif
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
#if !CONFIG_DEBLOCK_13TAP // No SIMD implementation for deblock_13tap yet
make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
8),
#endif
make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
10),
make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 10),
make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
10),
#if !CONFIG_DEBLOCK_13TAP
// Despite the name, the following function does 15-tap filtering,
// which is changed to 13-tap and not yet implemented in SIMD
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 10),
#endif
#if !CONFIG_DEBLOCK_13TAP // No SIMD implementation for deblock_13tap yet
make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
&aom_highbd_lpf_horizontal_16_dual_c, 10),
#endif
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
#if !CONFIG_DEBLOCK_13TAP // No SIMD implementation for deblock_13tap yet
make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
10),
#endif
make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
12),
make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 12),
make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
12),
#if !CONFIG_DEBLOCK_13TAP
// Despite the name, the following function does 15-tap filtering,
// which is changed to 13-tap and not yet implemented in SIMD
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 12),
#endif
#if !CONFIG_DEBLOCK_13TAP // No SIMD implementation for deblock_13tap yet
make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
&aom_highbd_lpf_horizontal_16_dual_c, 12),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment