Commit d6a7dd19 authored by Maxym Dmytrychenko

SSE2 optimization for lpf 16_dual implementations

Covers the horizontal and vertical variants,
for both low and high bitdepth types.

Appropriate tests are enabled

Performance gains, SSE2 over C:
Horizontal methods: up to 3x faster
Vertical   methods: up to 2x faster

Change-Id: If430a916394c7befa743e4fbaa9913fd37c535ed
parent b4d4aff4
......@@ -431,6 +431,7 @@ add_proto qw/void aom_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *b
specialize qw/aom_lpf_vertical_16 sse2/;
add_proto qw/void aom_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_vertical_16_dual sse2/;
add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_vertical_6 sse2/;
......
......@@ -835,7 +835,7 @@ void aom_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
}
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
......
......@@ -434,6 +434,7 @@ void aom_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p,
const uint8_t *_limit,
const uint8_t *_thresh, int bd) {
highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
highbd_lpf_horz_edge_8_4p(s + 4, p, _blimit, _limit, _thresh, bd);
}
static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1,
......
This diff is collapsed.
......@@ -450,6 +450,8 @@ const hbdloop_param_t kHbdLoop8Test6[] = {
8),
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 8),
make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
&aom_highbd_lpf_horizontal_16_dual_c, 8),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
8),
......@@ -460,6 +462,8 @@ const hbdloop_param_t kHbdLoop8Test6[] = {
10),
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 10),
make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
&aom_highbd_lpf_horizontal_16_dual_c, 10),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
10),
......@@ -470,6 +474,16 @@ const hbdloop_param_t kHbdLoop8Test6[] = {
12),
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 12),
make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
&aom_highbd_lpf_horizontal_16_dual_c, 12),
make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
12),
make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
&aom_highbd_lpf_vertical_16_dual_c, 8),
make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
&aom_highbd_lpf_vertical_16_dual_c, 10),
make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
&aom_highbd_lpf_vertical_16_dual_c, 12),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
};
......@@ -482,9 +496,12 @@ const loop_param_t kLoop8Test6[] = {
make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8),
make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8),
make_tuple(&aom_lpf_horizontal_16_sse2, &aom_lpf_horizontal_16_c, 8),
make_tuple(&aom_lpf_horizontal_16_dual_sse2, &aom_lpf_horizontal_16_dual_c,
8),
make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
make_tuple(&aom_lpf_vertical_16_sse2, &aom_lpf_vertical_16_c, 8),
make_tuple(&aom_lpf_vertical_16_dual_sse2, &aom_lpf_vertical_16_dual_c, 8)
};
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_lbd,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment