Commit d6a7dd19 authored by Maxym Dmytrychenko

SSE2 optimization for lpf 16_dual implementations

Covers the horizontal and vertical variants,
for both low and high bitdepth types.

Appropriate tests are enabled

Performance changes, SSE2 over C:
Horizontal methods: up to  3x
Vertical   methods: up to  2x

Change-Id: If430a916394c7befa743e4fbaa9913fd37c535ed
parent b4d4aff4
......@@ -431,6 +431,7 @@ add_proto qw/void aom_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *b
specialize qw/aom_lpf_vertical_16 sse2/;
add_proto qw/void aom_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_vertical_16_dual sse2/;
add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_vertical_6 sse2/;
......
......@@ -835,7 +835,7 @@ void aom_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
}
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
......
......@@ -434,6 +434,7 @@ void aom_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p,
const uint8_t *_limit,
const uint8_t *_thresh, int bd) {
highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd);
highbd_lpf_horz_edge_8_4p(s + 4, p, _blimit, _limit, _thresh, bd);
}
static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1,
......
......@@ -626,350 +626,6 @@ static INLINE __m128i filter16_mask(const __m128i *const flat,
return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}
// Selects which store helper store_buffer_horz_16() uses for each output row:
// FOUR_PIXELS -> xx_storel_32, EIGHT_PIXELS -> xx_storel_64,
// SIXTEEN_PIXELS -> xx_storeu_128.
typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput;
// Writes the 14 vectors x[0..13] back to the image, one vector per row.
//
// Row mapping: x[k] goes to address s - (k - offset) * p, and rows are written
// starting with k = 13 and ending with k = 0.
//
// pixel_num picks the store helper used for every row (xx_storel_32,
// xx_storel_64 or xx_storeu_128; presumably 4, 8 or 16 bytes per row -- the
// helpers are defined elsewhere, confirm there). Any other value of pixel_num
// stores nothing.
static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x,
                                        int p, int offset, uint8_t *s) {
  int row = 13;

  switch (pixel_num) {
    case FOUR_PIXELS:
      while (row >= 0) {
        xx_storel_32(s - (row - offset) * p, x[row]);
        --row;
      }
      break;
    case EIGHT_PIXELS:
      while (row >= 0) {
        xx_storel_64(s - (row - offset) * p, x[row]);
        --row;
      }
      break;
    case SIXTEEN_PIXELS:
      while (row >= 0) {
        xx_storeu_128(s - (row - offset) * p, x[row]);
        --row;
      }
      break;
    default:
      // Unknown width selector: nothing is written.
      break;
  }
}
// Wide loop filter across a horizontal edge, processing 16 byte columns at
// once with SSE2.
//
// s points at row q0 (the first row on the far side of the edge) and p is the
// row stride in bytes. Sixteen rows are read with unaligned 16-byte loads:
// p7..p0 from s - 8 * p .. s - 1 * p and q0..q7 from s .. s + 7 * p.
//
// _blimit, _limit and _thresh are each read with _mm_load_si128, i.e. as a
// full, 16-byte-aligned vector of per-lane byte thresholds.
//
// Stages, in order:
//   1. mask / flat / flat2 : per-lane decision masks.
//   2. filter4             : narrow filter on p1, p0, q0, q1.
//   3. filter8             : 8-tap path for p2..q2, blended in using flat.
//   4. wide flat           : 16-tap path for p6..q6, blended in using flat2.
// The 14 result rows (p6..q6) are then written by store_buffer_horz_16() with
// the store width chosen by pixel_num.
//
// abs_diff, filter8_mask, filter16_mask and filter_add2_sub2 are helpers
// defined elsewhere in the file; comments about them below are hedged.
static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num,
unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi8(1);
// Aligned loads: the three threshold pointers must be 16-byte aligned.
const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
const __m128i limit = _mm_load_si128((const __m128i *)_limit);
const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
__m128i mask, hev, flat, flat2;
__m128i p7, p6, p5;
__m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
__m128i q5, q6, q7;
__m128i op2, op1, op0, oq0, oq1, oq2;
__m128i max_abs_p1p0q1q0;
// Load the 8 rows above the edge (p7..p0) and the 8 rows from the edge
// downwards (q0..q7); unaligned loads, 16 bytes per row.
p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
// Build `mask`. The blimit test result (0xff where it fires) is folded into a
// running per-lane maximum of neighbouring-row differences; the final
// saturating subtract of `limit` plus compare-with-zero leaves 0xff in lanes
// whose running maximum is <= limit.
{
const __m128i abs_p1p0 = abs_diff(p1, p0);
const __m128i abs_q1q0 = abs_diff(q1, q0);
const __m128i fe = _mm_set1_epi8(0xfe);
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
__m128i abs_p0q0 = abs_diff(p0, q0);
__m128i abs_p1q1 = abs_diff(p1, q1);
__m128i work;
// Kept outside this scope: reused for `flat` and for `hev` further down.
max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
// Per-byte >> 1 using a 16-bit shift: the low bit of each byte is cleared
// first so nothing leaks into the neighbouring byte.
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
mask = _mm_max_epu8(work, mask);
work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
mask = _mm_max_epu8(work, mask);
mask = _mm_subs_epu8(mask, limit);
mask = _mm_cmpeq_epi8(mask, zero);
}
// Build `flat` (differences of p1..p3 / q1..q3 against p0 / q0 all <= 1, and
// mask set) and `flat2` (the same test for p4..p7 / q4..q7, ANDed with flat).
{
__m128i work;
work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
flat = _mm_max_epu8(work, flat);
// This p4/q4 maximum is not part of `flat`; it is consumed by the flat2
// computation a few lines below.
work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
flat = _mm_subs_epu8(flat, one);
flat = _mm_cmpeq_epi8(flat, zero);
flat = _mm_and_si128(flat, mask);
flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
flat2 = _mm_max_epu8(work, flat2);
work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
flat2 = _mm_max_epu8(work, flat2);
work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
flat2 = _mm_max_epu8(work, flat2);
flat2 = _mm_subs_epu8(flat2, one);
flat2 = _mm_cmpeq_epi8(flat2, zero);
flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// filter4
// Note: this scope stays open until the end of the function; the filter8 and
// wide-flat scopes are nested inside it.
{
const __m128i t4 = _mm_set1_epi8(4);
const __m128i t3 = _mm_set1_epi8(3);
const __m128i t80 = _mm_set1_epi8(0x80);
const __m128i te0 = _mm_set1_epi8(0xe0);
const __m128i t1f = _mm_set1_epi8(0x1f);
const __m128i t1 = _mm_set1_epi8(0x1);
const __m128i t7f = _mm_set1_epi8(0x7f);
const __m128i ff = _mm_cmpeq_epi8(t4, t4);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
// XOR with 0x80 flips the top bit so the bytes can be used with the signed
// saturating add/sub intrinsics; the same XOR is applied again on output.
op1 = _mm_xor_si128(p1, t80);
op0 = _mm_xor_si128(p0, t80);
oq0 = _mm_xor_si128(q0, t80);
oq1 = _mm_xor_si128(q1, t80);
// hev: 0xff in lanes where max(|p1 - p0|, |q1 - q0|) > thresh.
hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
work_a = _mm_subs_epi8(oq0, op0);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
// (aom_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
filter1 = _mm_adds_epi8(filt, t4);
filter2 = _mm_adds_epi8(filt, t3);
// Filter1 >> 3
// Signed per-byte shift built from a 16-bit logical shift: keep the low 5
// bits of each byte (0x1f) and fill the top 3 bits (0xe0) in negative lanes.
work_a = _mm_cmpgt_epi8(zero, filter1);
filter1 = _mm_srli_epi16(filter1, 3);
work_a = _mm_and_si128(work_a, te0);
filter1 = _mm_and_si128(filter1, t1f);
filter1 = _mm_or_si128(filter1, work_a);
oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
// Filter2 >> 3
work_a = _mm_cmpgt_epi8(zero, filter2);
filter2 = _mm_srli_epi16(filter2, 3);
work_a = _mm_and_si128(work_a, te0);
filter2 = _mm_and_si128(filter2, t1f);
filter2 = _mm_or_si128(filter2, work_a);
op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
// filt >> 1
// Computes (filter1 + 1) >> 1 with the same sign-fill technique, then zeroes
// it in hev lanes so p1/q1 are adjusted only where hev is clear.
filt = _mm_adds_epi8(filter1, t1);
work_a = _mm_cmpgt_epi8(zero, filt);
filt = _mm_srli_epi16(filt, 1);
work_a = _mm_and_si128(work_a, t80);
filt = _mm_and_si128(filt, t7f);
filt = _mm_or_si128(filt, work_a);
filt = _mm_andnot_si128(hev, filt);
op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
// loopfilter done
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// filter8
// Rows are widened to 16-bit lanes (low and high 8 columns separately) so the
// running sums cannot overflow a byte.
{
const __m128i four = _mm_set1_epi16(4);
const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
__m128i f8_lo, f8_hi;
// Initial sum: 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4.
f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
_mm_add_epi16(p3_lo, p2_lo));
f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
_mm_add_epi16(p2_lo, p1_lo));
f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
_mm_add_epi16(p3_hi, p2_hi));
f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
_mm_add_epi16(p2_hi, p1_hi));
f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
// Each step below updates the running sum for the next output row
// (filter_add2_sub2 presumably adds its first two row arguments and
// subtracts the last two -- confirm at its definition) and lets
// filter8_mask choose between the filter4 result and the new value
// according to `flat`.
op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// wide flat calculations
// Same sliding-sum scheme over all 16 rows; results are selected with flat2
// and collected in x[13] (p6) down to x[0] (q6).
{
const __m128i eight = _mm_set1_epi16(8);
const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
__m128i f_lo;
__m128i f_hi;
// Initial sum:
// 7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8.
f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7
f_lo =
_mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
_mm_add_epi16(p2_lo, p1_lo));
f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7
f_hi =
_mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
_mm_add_epi16(p2_hi, p1_hi));
f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
// Output rows, indexed so that x[13] = p6 ... x[7] = p0, x[6] = q0 ...
// x[0] = q6. For p2..q2 the fallback value passed to filter16_mask is the
// filter8/filter4 result (op2..oq2); for the outer rows it is the unfiltered
// input row.
__m128i x[14];
x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
// offset 6 maps x[i] to row s - (i - 6) * p, i.e. x[13] -> s - 7 * p (p6)
// and x[0] -> s + 6 * p (q6). Rows p7 and q7 are read but never written.
store_buffer_horz_16(pixel_num, x, p, 6, s);
}
// wide flat
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
}
}
void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
......@@ -1316,7 +972,8 @@ void aom_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh);
aom_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh);
aom_lpf_horizontal_16_sse2(s + 4, p, _blimit, _limit, _thresh);
}
void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
......
......@@ -450,6 +450,8 @@ const hbdloop_param_t kHbdLoop8Test6[] = {
8),
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 8),
make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
&aom_highbd_lpf_horizontal_16_dual_c, 8),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
8),
......@@ -460,6 +462,8 @@ const hbdloop_param_t kHbdLoop8Test6[] = {
10),
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 10),
make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
&aom_highbd_lpf_horizontal_16_dual_c, 10),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
10),
......@@ -470,6 +474,16 @@ const hbdloop_param_t kHbdLoop8Test6[] = {
12),
make_tuple(&aom_highbd_lpf_horizontal_16_sse2,
&aom_highbd_lpf_horizontal_16_c, 12),
make_tuple(&aom_highbd_lpf_horizontal_16_dual_sse2,
&aom_highbd_lpf_horizontal_16_dual_c, 12),
make_tuple(&aom_highbd_lpf_vertical_16_sse2, &aom_highbd_lpf_vertical_16_c,
12),
make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
&aom_highbd_lpf_vertical_16_dual_c, 8),
make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
&aom_highbd_lpf_vertical_16_dual_c, 10),
make_tuple(&aom_highbd_lpf_vertical_16_dual_sse2,
&aom_highbd_lpf_vertical_16_dual_c, 12),
make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
};
......@@ -482,9 +496,12 @@ const loop_param_t kLoop8Test6[] = {
make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8),
make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8),
make_tuple(&aom_lpf_horizontal_16_sse2, &aom_lpf_horizontal_16_c, 8),
make_tuple(&aom_lpf_horizontal_16_dual_sse2, &aom_lpf_horizontal_16_dual_c,
8),
make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
make_tuple(&aom_lpf_vertical_16_sse2, &aom_lpf_vertical_16_c, 8),
make_tuple(&aom_lpf_vertical_16_dual_sse2, &aom_lpf_vertical_16_dual_c, 8)
};
INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_lbd,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment