Commit bdddf33a authored by Yi Luo's avatar Yi Luo
Browse files

Highbd rectangle intrapred V/DC sse2 optimization

Function speedup (i7-6700), SSE2 versus C:
Predictor      V_PRED    DC_PRED
4x8            ~1.5x     ~4.9x
8x4            ~2.5x     ~4.8x
8x16           ~1.9x     ~9.1x
16x8           ~1.9x     ~4.4x
16x32          ~2.1x     ~5.8x
32x16          ~2.0x     ~3.6x

Change-Id: I6deffd0637e57ee5d0bd533502f5705148c4cdd4
parent 1727fac4
...@@ -116,12 +116,24 @@ specialize qw/aom_dc_128_predictor_32x32 msa neon sse2/; ...@@ -116,12 +116,24 @@ specialize qw/aom_dc_128_predictor_32x32 msa neon sse2/;
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_highbd_v_predictor_4x4 sse2/; specialize qw/aom_highbd_v_predictor_4x4 sse2/;
specialize qw/aom_highbd_dc_predictor_4x4 sse2/; specialize qw/aom_highbd_v_predictor_4x8 sse2/;
specialize qw/aom_highbd_v_predictor_8x4 sse2/;
specialize qw/aom_highbd_v_predictor_8x8 sse2/; specialize qw/aom_highbd_v_predictor_8x8 sse2/;
specialize qw/aom_highbd_dc_predictor_8x8 sse2/;; specialize qw/aom_highbd_v_predictor_8x16 sse2/;
specialize qw/aom_highbd_v_predictor_16x8 sse2/;
specialize qw/aom_highbd_v_predictor_16x16 sse2/; specialize qw/aom_highbd_v_predictor_16x16 sse2/;
specialize qw/aom_highbd_dc_predictor_16x16 sse2/; specialize qw/aom_highbd_v_predictor_16x32 sse2/;
specialize qw/aom_highbd_v_predictor_32x16 sse2/;
specialize qw/aom_highbd_v_predictor_32x32 sse2/; specialize qw/aom_highbd_v_predictor_32x32 sse2/;
# SSE2 specializations of the high-bitdepth DC predictors.
specialize qw/aom_highbd_dc_predictor_4x4 sse2/;
specialize qw/aom_highbd_dc_predictor_4x8 sse2/;
specialize qw/aom_highbd_dc_predictor_8x4 sse2/;
specialize qw/aom_highbd_dc_predictor_8x8 sse2/;
specialize qw/aom_highbd_dc_predictor_8x16 sse2/;
specialize qw/aom_highbd_dc_predictor_16x8 sse2/;
specialize qw/aom_highbd_dc_predictor_16x16 sse2/;
specialize qw/aom_highbd_dc_predictor_16x32 sse2/;
specialize qw/aom_highbd_dc_predictor_32x16 sse2/;
specialize qw/aom_highbd_dc_predictor_32x32 sse2/; specialize qw/aom_highbd_dc_predictor_32x32 sse2/;
specialize qw/aom_highbd_h_predictor_4x4 sse2/; specialize qw/aom_highbd_h_predictor_4x4 sse2/;
specialize qw/aom_highbd_h_predictor_4x8 sse2/; specialize qw/aom_highbd_h_predictor_4x8 sse2/;
......
...@@ -681,3 +681,304 @@ void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, ...@@ -681,3 +681,304 @@ void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
(void)left; (void)left;
dc_store_32xh(dst, stride, 32, &dc_dup); dc_store_32xh(dst, stride, 32, &dc_dup);
} }
// -----------------------------------------------------------------------------
// V_PRED
// Vertical predictor for a 4-wide, 8-high block of 16-bit samples: every
// output row is a copy of the first four samples of `above`.
//   dst    - top-left sample of the destination block
//   stride - distance between consecutive rows of dst, in uint16_t elements
//   above  - row of samples above the block (four are read)
//   left, bd - unused
void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  // 64-bit load/store moves exactly four uint16_t samples and has no
  // alignment requirement.
  const __m128i top = _mm_loadl_epi64((const __m128i *)above);
  // Each row is addressed as dst + r * stride. This avoids `stride << 2`,
  // which is undefined for a negative stride, and never forms a pointer
  // beyond the last row written.
  for (int r = 0; r < 8; ++r) {
    _mm_storel_epi64((__m128i *)(dst + r * stride), top);
  }
}
// Vertical predictor for an 8-wide, 4-high block of 16-bit samples: each of
// the four output rows receives the eight samples read from `above`.
// Aligned 128-bit load/store intrinsics are used, so `above` and every
// destination row must be 16-byte aligned. `left` and `bd` are unused.
void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const __m128i top = _mm_load_si128((const __m128i *)above);
  for (int r = 0; r < 4; ++r) {
    _mm_store_si128((__m128i *)(dst + r * stride), top);
  }
  (void)left;
  (void)bd;
}
// Vertical predictor for an 8-wide, 16-high block of 16-bit samples: every
// output row is a copy of the eight samples read from `above`.
//   dst    - top-left sample of the destination block
//   stride - distance between consecutive rows of dst, in uint16_t elements
//   above  - row of samples above the block (eight are read)
//   left, bd - unused
// Aligned 128-bit load/store intrinsics are used, so `above` and every
// destination row must be 16-byte aligned.
void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i top = _mm_load_si128((const __m128i *)above);
  // Each row is addressed as dst + r * stride. This avoids `stride << 2`,
  // which is undefined for a negative stride, and never forms a pointer
  // beyond the last row written.
  for (int r = 0; r < 16; ++r) {
    _mm_store_si128((__m128i *)(dst + r * stride), top);
  }
}
// Vertical predictor for a 16-wide, 8-high block of 16-bit samples: the 16
// samples read from `above` are written to each of the eight output rows.
// Aligned 128-bit load/store intrinsics are used, so `above` and every
// destination row must be 16-byte aligned. `left` and `bd` are unused.
void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const __m128i *src = (const __m128i *)above;
  // One __m128i holds eight uint16_t samples, so src[1] is above + 8.
  const __m128i lo = _mm_load_si128(src);
  const __m128i hi = _mm_load_si128(src + 1);
  for (int r = 0; r < 8; ++r) {
    __m128i *out = (__m128i *)(dst + r * stride);
    _mm_store_si128(out, lo);
    _mm_store_si128(out + 1, hi);
  }
  (void)left;
  (void)bd;
}
// Vertical predictor for a 16-wide, 32-high block of 16-bit samples: the 16
// samples read from `above` are written to each of the 32 output rows.
// Aligned 128-bit load/store intrinsics are used, so `above` and every
// destination row must be 16-byte aligned. `left` and `bd` are unused.
void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const __m128i *src = (const __m128i *)above;
  // One __m128i holds eight uint16_t samples, so src[1] is above + 8.
  const __m128i lo = _mm_load_si128(src);
  const __m128i hi = _mm_load_si128(src + 1);
  for (int r = 0; r < 32; ++r) {
    __m128i *out = (__m128i *)(dst + r * stride);
    _mm_store_si128(out, lo);
    _mm_store_si128(out + 1, hi);
  }
  (void)left;
  (void)bd;
}
// Vertical predictor for a 32-wide, 16-high block of 16-bit samples: the 32
// samples read from `above` are written to each of the 16 output rows.
// Aligned 128-bit load/store intrinsics are used, so `above` and every
// destination row must be 16-byte aligned. `left` and `bd` are unused.
void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const __m128i *src = (const __m128i *)above;
  // Four vectors of eight uint16_t samples each cover the 32-sample row.
  const __m128i q0 = _mm_load_si128(src);
  const __m128i q1 = _mm_load_si128(src + 1);
  const __m128i q2 = _mm_load_si128(src + 2);
  const __m128i q3 = _mm_load_si128(src + 3);
  for (int r = 0; r < 16; ++r) {
    __m128i *out = (__m128i *)(dst + r * stride);
    _mm_store_si128(out, q0);
    _mm_store_si128(out + 1, q1);
    _mm_store_si128(out + 2, q2);
    _mm_store_si128(out + 3, q3);
  }
  (void)left;
  (void)bd;
}
// -----------------------------------------------------------------------------
// DC_PRED
// DC predictor for a 4-wide, 8-high block of 16-bit samples: fills all eight
// rows with one value derived from dc_sum_4(above) and dc_sum_8(left).
// NOTE(review): dc_sum_4/dc_sum_8 are defined elsewhere; presumably they
// return the sample sums replicated in the low 16-bit lanes - confirm.
// `bd` is unused.
void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)bd;
  const __m128i above_sum = dc_sum_4(above);
  const __m128i left_sum = dc_sum_8(left);
  const __m128i total = _mm_add_epi16(above_sum, left_sum);
  // The low 32 bits hold two 16-bit lanes; the shift keeps the upper one.
  const uint32_t packed = _mm_cvtsi128_si32(total);
  // Divide by 12 with a +6 bias, i.e. round to nearest.
  const uint32_t dc = ((packed >> 16) + 6) / 12;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  // 64-bit stores write four uint16_t samples per row.
  for (int r = 0; r < 8; ++r) {
    _mm_storel_epi64((__m128i *)(dst + r * stride), fill);
  }
}
// DC predictor for an 8-wide, 4-high block of 16-bit samples: fills all four
// rows with one value derived from dc_sum_4(left) and dc_sum_8(above).
// NOTE(review): dc_sum_4/dc_sum_8 are defined elsewhere; presumably they
// return the sample sums replicated in the low 16-bit lanes - confirm.
// Aligned 128-bit stores are used, so every destination row must be 16-byte
// aligned. `bd` is unused.
void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)bd;
  const __m128i left_sum = dc_sum_4(left);
  const __m128i above_sum = dc_sum_8(above);
  const __m128i total = _mm_add_epi16(above_sum, left_sum);
  // The low 32 bits hold two 16-bit lanes; the shift keeps the upper one.
  const uint32_t packed = _mm_cvtsi128_si32(total);
  // Divide by 12 with a +6 bias, i.e. round to nearest.
  const uint32_t dc = ((packed >> 16) + 6) / 12;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  for (int r = 0; r < 4; ++r) {
    _mm_store_si128((__m128i *)(dst + r * stride), fill);
  }
}
// DC predictor for an 8-wide, 16-high block of 16-bit samples: fills all 16
// rows with one value derived from dc_sum_16(left) and dc_sum_8(above).
// NOTE(review): dc_sum_8/dc_sum_16 are defined elsewhere; presumably each
// leaves its sum in the lowest 16-bit lane - confirm.
// Aligned 128-bit stores are used, so every destination row must be 16-byte
// aligned. `bd` is unused.
void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)bd;
  const __m128i left16 = dc_sum_16(left);
  const __m128i above16 = dc_sum_8(above);
  const __m128i zero = _mm_setzero_si128();
  // Interleaving with zero widens the low 16-bit lanes to 32 bits so the
  // addition below is done in 32-bit lanes.
  const __m128i left32 = _mm_unpacklo_epi16(left16, zero);
  const __m128i above32 = _mm_unpacklo_epi16(above16, zero);
  const __m128i total = _mm_add_epi32(left32, above32);
  const uint32_t raw = _mm_cvtsi128_si32(total);
  // Divide by 24 with a +12 bias, i.e. round to nearest.
  const uint32_t dc = (raw + 12) / 24;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  for (int r = 0; r < 16; ++r) {
    _mm_store_si128((__m128i *)(dst + r * stride), fill);
  }
}
// DC predictor for a 16-wide, 8-high block of 16-bit samples: fills all
// eight rows with one value derived from dc_sum_8(left) and
// dc_sum_16(above).
// NOTE(review): dc_sum_8/dc_sum_16 are defined elsewhere; presumably each
// leaves its sum in the lowest 16-bit lane - confirm.
// Aligned 128-bit stores are used, so every destination row must be 16-byte
// aligned. `bd` is unused.
void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)bd;
  const __m128i left16 = dc_sum_8(left);
  const __m128i above16 = dc_sum_16(above);
  const __m128i zero = _mm_setzero_si128();
  // Interleaving with zero widens the low 16-bit lanes to 32 bits so the
  // addition below is done in 32-bit lanes.
  const __m128i left32 = _mm_unpacklo_epi16(left16, zero);
  const __m128i above32 = _mm_unpacklo_epi16(above16, zero);
  const __m128i total = _mm_add_epi32(left32, above32);
  const uint32_t raw = _mm_cvtsi128_si32(total);
  // Divide by 24 with a +12 bias, i.e. round to nearest.
  const uint32_t dc = (raw + 12) / 24;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  for (int r = 0; r < 8; ++r) {
    __m128i *out = (__m128i *)(dst + r * stride);
    // Two 8-sample vectors cover the 16-sample row.
    _mm_store_si128(out, fill);
    _mm_store_si128(out + 1, fill);
  }
}
// DC predictor for a 16-wide, 32-high block of 16-bit samples: fills all 32
// rows with one value derived from dc_sum_32(left) and dc_sum_16(above).
// NOTE(review): dc_sum_16/dc_sum_32 are defined elsewhere. The dc_sum_32
// result is fed to the 32-bit add unwidened, so it presumably already holds
// a 32-bit sum in its lowest lane - confirm against the helper.
// Aligned 128-bit stores are used, so every destination row must be 16-byte
// aligned. `bd` is unused.
void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  (void)bd;
  const __m128i left32 = dc_sum_32(left);
  const __m128i above16 = dc_sum_16(above);
  const __m128i zero = _mm_setzero_si128();
  // Only the `above` sum is widened from 16-bit to 32-bit lanes.
  const __m128i above32 = _mm_unpacklo_epi16(above16, zero);
  const __m128i total = _mm_add_epi32(left32, above32);
  const uint32_t raw = _mm_cvtsi128_si32(total);
  // Divide by 48 with a +24 bias, i.e. round to nearest.
  const uint32_t dc = (raw + 24) / 48;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  for (int r = 0; r < 32; ++r) {
    __m128i *out = (__m128i *)(dst + r * stride);
    // Two 8-sample vectors cover the 16-sample row.
    _mm_store_si128(out, fill);
    _mm_store_si128(out + 1, fill);
  }
}
// DC predictor for a 32-wide, 16-high block of 16-bit samples: fills all 16
// rows with one value derived from dc_sum_16(left) and dc_sum_32(above).
// NOTE(review): dc_sum_16/dc_sum_32 are defined elsewhere. The dc_sum_32
// result is fed to the 32-bit add unwidened, so it presumably already holds
// a 32-bit sum in its lowest lane - confirm against the helper.
// Aligned 128-bit stores are used, so every destination row must be 16-byte
// aligned. `bd` is unused.
void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  (void)bd;
  const __m128i left16 = dc_sum_16(left);
  const __m128i above32 = dc_sum_32(above);
  const __m128i zero = _mm_setzero_si128();
  // Only the `left` sum is widened from 16-bit to 32-bit lanes.
  const __m128i left32 = _mm_unpacklo_epi16(left16, zero);
  const __m128i total = _mm_add_epi32(left32, above32);
  const uint32_t raw = _mm_cvtsi128_si32(total);
  // Divide by 48 with a +24 bias, i.e. round to nearest.
  const uint32_t dc = (raw + 24) / 48;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  for (int r = 0; r < 16; ++r) {
    __m128i *out = (__m128i *)(dst + r * stride);
    // Four 8-sample vectors cover the 32-sample row.
    _mm_store_si128(out, fill);
    _mm_store_si128(out + 1, fill);
    _mm_store_si128(out + 2, fill);
    _mm_store_si128(out + 3, fill);
  }
}
...@@ -137,9 +137,16 @@ TEST_P(AV1IntraPredTest, IntraPredTests) { ...@@ -137,9 +137,16 @@ TEST_P(AV1IntraPredTest, IntraPredTests) {
#if CONFIG_HIGHBITDEPTH #if CONFIG_HIGHBITDEPTH
const IntraPredFunc IntraPredTestVector8[] = { const IntraPredFunc IntraPredTestVector8[] = {
highbd_entry(dc, 4, 4, sse2, 8), highbd_entry(dc, 4, 4, sse2, 8),
highbd_entry(dc, 4, 8, sse2, 8),
highbd_entry(dc, 8, 4, sse2, 8),
highbd_entry(dc, 8, 8, sse2, 8), highbd_entry(dc, 8, 8, sse2, 8),
highbd_entry(dc, 8, 16, sse2, 8),
highbd_entry(dc, 16, 8, sse2, 8),
highbd_entry(dc, 16, 16, sse2, 8), highbd_entry(dc, 16, 16, sse2, 8),
highbd_entry(dc, 16, 32, sse2, 8),
highbd_entry(dc, 32, 16, sse2, 8),
highbd_entry(dc, 32, 32, sse2, 8), highbd_entry(dc, 32, 32, sse2, 8),
highbd_entry(dc_left, 4, 4, sse2, 8), highbd_entry(dc_left, 4, 4, sse2, 8),
highbd_entry(dc_left, 4, 8, sse2, 8), highbd_entry(dc_left, 4, 8, sse2, 8),
highbd_entry(dc_top, 4, 4, sse2, 8), highbd_entry(dc_top, 4, 4, sse2, 8),
...@@ -173,10 +180,18 @@ const IntraPredFunc IntraPredTestVector8[] = { ...@@ -173,10 +180,18 @@ const IntraPredFunc IntraPredTestVector8[] = {
highbd_entry(dc_left, 32, 32, sse2, 8), highbd_entry(dc_left, 32, 32, sse2, 8),
highbd_entry(dc_top, 32, 32, sse2, 8), highbd_entry(dc_top, 32, 32, sse2, 8),
highbd_entry(dc_128, 32, 32, sse2, 8), highbd_entry(dc_128, 32, 32, sse2, 8),
highbd_entry(v, 4, 4, sse2, 8), highbd_entry(v, 4, 4, sse2, 8),
highbd_entry(v, 4, 8, sse2, 8),
highbd_entry(v, 8, 4, sse2, 8),
highbd_entry(v, 8, 8, sse2, 8), highbd_entry(v, 8, 8, sse2, 8),
highbd_entry(v, 8, 16, sse2, 8),
highbd_entry(v, 16, 8, sse2, 8),
highbd_entry(v, 16, 16, sse2, 8), highbd_entry(v, 16, 16, sse2, 8),
highbd_entry(v, 16, 32, sse2, 8),
highbd_entry(v, 32, 16, sse2, 8),
highbd_entry(v, 32, 32, sse2, 8), highbd_entry(v, 32, 32, sse2, 8),
highbd_entry(h, 4, 4, sse2, 8), highbd_entry(h, 4, 4, sse2, 8),
highbd_entry(h, 4, 8, sse2, 8), highbd_entry(h, 4, 8, sse2, 8),
highbd_entry(h, 8, 4, sse2, 8), highbd_entry(h, 8, 4, sse2, 8),
...@@ -194,9 +209,16 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, AV1IntraPredTest, ...@@ -194,9 +209,16 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, AV1IntraPredTest,
const IntraPredFunc IntraPredTestVector10[] = { const IntraPredFunc IntraPredTestVector10[] = {
highbd_entry(dc, 4, 4, sse2, 10), highbd_entry(dc, 4, 4, sse2, 10),
highbd_entry(dc, 4, 8, sse2, 10),
highbd_entry(dc, 8, 4, sse2, 10),
highbd_entry(dc, 8, 8, sse2, 10), highbd_entry(dc, 8, 8, sse2, 10),
highbd_entry(dc, 8, 16, sse2, 10),
highbd_entry(dc, 16, 8, sse2, 10),
highbd_entry(dc, 16, 16, sse2, 10), highbd_entry(dc, 16, 16, sse2, 10),
highbd_entry(dc, 16, 32, sse2, 10),
highbd_entry(dc, 32, 16, sse2, 10),
highbd_entry(dc, 32, 32, sse2, 10), highbd_entry(dc, 32, 32, sse2, 10),
highbd_entry(dc_left, 4, 4, sse2, 10), highbd_entry(dc_left, 4, 4, sse2, 10),
highbd_entry(dc_left, 4, 8, sse2, 10), highbd_entry(dc_left, 4, 8, sse2, 10),
highbd_entry(dc_top, 4, 4, sse2, 10), highbd_entry(dc_top, 4, 4, sse2, 10),
...@@ -230,10 +252,18 @@ const IntraPredFunc IntraPredTestVector10[] = { ...@@ -230,10 +252,18 @@ const IntraPredFunc IntraPredTestVector10[] = {
highbd_entry(dc_left, 32, 32, sse2, 10), highbd_entry(dc_left, 32, 32, sse2, 10),
highbd_entry(dc_top, 32, 32, sse2, 10), highbd_entry(dc_top, 32, 32, sse2, 10),
highbd_entry(dc_128, 32, 32, sse2, 10), highbd_entry(dc_128, 32, 32, sse2, 10),
highbd_entry(v, 4, 4, sse2, 10), highbd_entry(v, 4, 4, sse2, 10),
highbd_entry(v, 4, 8, sse2, 10),
highbd_entry(v, 8, 4, sse2, 10),
highbd_entry(v, 8, 8, sse2, 10), highbd_entry(v, 8, 8, sse2, 10),
highbd_entry(v, 8, 16, sse2, 10),
highbd_entry(v, 16, 8, sse2, 10),
highbd_entry(v, 16, 16, sse2, 10), highbd_entry(v, 16, 16, sse2, 10),
highbd_entry(v, 16, 32, sse2, 10),
highbd_entry(v, 32, 16, sse2, 10),
highbd_entry(v, 32, 32, sse2, 10), highbd_entry(v, 32, 32, sse2, 10),
highbd_entry(h, 4, 4, sse2, 10), highbd_entry(h, 4, 4, sse2, 10),
highbd_entry(h, 4, 8, sse2, 10), highbd_entry(h, 4, 8, sse2, 10),
highbd_entry(h, 8, 4, sse2, 10), highbd_entry(h, 8, 4, sse2, 10),
...@@ -251,9 +281,16 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, AV1IntraPredTest, ...@@ -251,9 +281,16 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, AV1IntraPredTest,
const IntraPredFunc IntraPredTestVector12[] = { const IntraPredFunc IntraPredTestVector12[] = {
highbd_entry(dc, 4, 4, sse2, 12), highbd_entry(dc, 4, 4, sse2, 12),
highbd_entry(dc, 4, 8, sse2, 12),
highbd_entry(dc, 8, 4, sse2, 12),
highbd_entry(dc, 8, 8, sse2, 12), highbd_entry(dc, 8, 8, sse2, 12),
highbd_entry(dc, 8, 16, sse2, 12),
highbd_entry(dc, 16, 8, sse2, 12),
highbd_entry(dc, 16, 16, sse2, 12), highbd_entry(dc, 16, 16, sse2, 12),
highbd_entry(dc, 16, 32, sse2, 12),
highbd_entry(dc, 32, 16, sse2, 12),
highbd_entry(dc, 32, 32, sse2, 12), highbd_entry(dc, 32, 32, sse2, 12),
highbd_entry(dc_left, 4, 4, sse2, 12), highbd_entry(dc_left, 4, 4, sse2, 12),
highbd_entry(dc_left, 4, 8, sse2, 12), highbd_entry(dc_left, 4, 8, sse2, 12),
highbd_entry(dc_top, 4, 4, sse2, 12), highbd_entry(dc_top, 4, 4, sse2, 12),
...@@ -287,10 +324,18 @@ const IntraPredFunc IntraPredTestVector12[] = { ...@@ -287,10 +324,18 @@ const IntraPredFunc IntraPredTestVector12[] = {
highbd_entry(dc_left, 32, 32, sse2, 12), highbd_entry(dc_left, 32, 32, sse2, 12),
highbd_entry(dc_top, 32, 32, sse2, 12), highbd_entry(dc_top, 32, 32, sse2, 12),
highbd_entry(dc_128, 32, 32, sse2, 12), highbd_entry(dc_128, 32, 32, sse2, 12),
highbd_entry(v, 4, 4, sse2, 12), highbd_entry(v, 4, 4, sse2, 12),
highbd_entry(v, 4, 8, sse2, 12),
highbd_entry(v, 8, 4, sse2, 12),
highbd_entry(v, 8, 8, sse2, 12), highbd_entry(v, 8, 8, sse2, 12),
highbd_entry(v, 8, 16, sse2, 12),
highbd_entry(v, 16, 8, sse2, 12),
highbd_entry(v, 16, 16, sse2, 12), highbd_entry(v, 16, 16, sse2, 12),
highbd_entry(v, 16, 32, sse2, 12),
highbd_entry(v, 32, 16, sse2, 12),
highbd_entry(v, 32, 32, sse2, 12), highbd_entry(v, 32, 32, sse2, 12),
highbd_entry(h, 4, 4, sse2, 12), highbd_entry(h, 4, 4, sse2, 12),
highbd_entry(h, 4, 8, sse2, 12), highbd_entry(h, 4, 8, sse2, 12),
highbd_entry(h, 8, 4, sse2, 12), highbd_entry(h, 8, 4, sse2, 12),
......
...@@ -791,16 +791,20 @@ HIGHBD_INTRA_PRED_TEST( ...@@ -791,16 +791,20 @@ HIGHBD_INTRA_PRED_TEST(
#undef smooth_h_pred_func #undef smooth_h_pred_func
#if HAVE_SSE2 #if HAVE_SSE2
HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred4, "Hbd Intra4x4", NULL, HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred4, "Hbd Intra4x4",
aom_highbd_dc_predictor_4x4_sse2,
aom_highbd_dc_left_predictor_4x4_sse2, aom_highbd_dc_left_predictor_4x4_sse2,
aom_highbd_dc_top_predictor_4x4_sse2, aom_highbd_dc_top_predictor_4x4_sse2,
aom_highbd_dc_128_predictor_4x4_sse2, NULL, aom_highbd_dc_128_predictor_4x4_sse2,
aom_highbd_v_predictor_4x4_sse2,
aom_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, aom_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL) NULL, NULL, NULL, NULL, NULL, NULL)
HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred4, "Hbd Intra4x8", NULL, HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred4, "Hbd Intra4x8",
aom_highbd_dc_predictor_4x8_sse2,
aom_highbd_dc_left_predictor_4x8_sse2, aom_highbd_dc_left_predictor_4x8_sse2,
aom_highbd_dc_top_predictor_4x8_sse2, aom_highbd_dc_top_predictor_4x8_sse2,
aom_highbd_dc_128_predictor_4x8_sse2, NULL, aom_highbd_dc_128_predictor_4x8_sse2,
aom_highbd_v_predictor_4x8_sse2,
aom_highbd_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL, aom_highbd_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL) NULL, NULL, NULL, NULL, NULL, NULL)
#endif #endif
...@@ -849,22 +853,28 @@ HIGHBD_INTRA_PRED_TEST( ...@@ -849,22 +853,28 @@ HIGHBD_INTRA_PRED_TEST(
#undef smooth_h_pred_func #undef smooth_h_pred_func
#if HAVE_SSE2 #if HAVE_SSE2
HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred8, "Hbd Intra8x8", NULL, HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred8, "Hbd Intra8x8",
aom_highbd_dc_predictor_8x8_sse2,
aom_highbd_dc_left_predictor_8x8_sse2, aom_highbd_dc_left_predictor_8x8_sse2,
aom_highbd_dc_top_predictor_8x8_sse2, aom_highbd_dc_top_predictor_8x8_sse2,
aom_highbd_dc_128_predictor_8x8_sse2, NULL, aom_highbd_dc_128_predictor_8x8_sse2,
aom_highbd_v_predictor_8x8_sse2,
aom_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, aom_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL) NULL, NULL, NULL, NULL, NULL, NULL)
HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred8, "Hbd Intra8x4", NULL, HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred8, "Hbd Intra8x4",
aom_highbd_dc_predictor_8x4_sse2,
aom_highbd_dc_left_predictor_8x4_sse2, aom_highbd_dc_left_predictor_8x4_sse2,
aom_highbd_dc_top_predictor_8x4_sse2, aom_highbd_dc_top_predictor_8x4_sse2,
aom_highbd_dc_128_predictor_8x4_sse2, NULL, aom_highbd_dc_128_predictor_8x4_sse2,
aom_highbd_v_predictor_8x4_sse2,
aom_highbd_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL, aom_highbd_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL) NULL, NULL, NULL, NULL, NULL, NULL)
HIGHBD_INTRA_PRED_TEST(SSE2_3, TestHighbdIntraPred8, "Hbd Intra8x16", NULL, HIGHBD_INTRA_PRED_TEST(SSE2_3, TestHighbdIntraPred8, "Hbd Intra8x16",
aom_highbd_dc_predictor_8x16_sse2,
aom_highbd_dc_left_predictor_8x16_sse2, aom_highbd_dc_left_predictor_8x16_sse2,
aom_highbd_dc_top_predictor_8x16_sse2, aom_highbd_dc_top_predictor_8x16_sse2,
aom_highbd_dc_128_predictor_8x16_sse2, NULL, aom_highbd_dc_128_predictor_8x16_sse2,
aom_highbd_v_predictor_8x16_sse2,
aom_highbd_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL, aom_highbd_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL) NULL, NULL, NULL, NULL, NULL, NULL)
#endif #endif
...@@ -934,22 +944,28 @@ HIGHBD_INTRA_PRED_TEST( ...@@ -934,22 +944,28 @@ HIGHBD_INTRA_PRED_TEST(
#undef smooth_h_pred_func #undef smooth_h_pred_func
#if HAVE_SSE2 #if HAVE_SSE2
HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred16, "Hbd Intra16x16", NULL, HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred16, "Hbd Intra16x16",
aom_highbd_dc_predictor_16x16_sse2,
aom_highbd_dc_left_predictor_16x16_sse2, aom_highbd_dc_left_pred