Commit 56ad3dd3 authored by Yi Luo's avatar Yi Luo

Highbd D45E intrapred SSE2/AVX2 speedup

Function  SSE2 vs C  AVX2 vs C
4x4       ~4.5x
4x8       ~4.5x
8x4       ~11.7x
8x8       ~12.7x
8x16      ~14.0x
16x8                 ~21.7x
16x16                ~24.0x
16x32                ~28.7x
32x16                ~20.5x
32x32                ~24.4x

Change-Id: Iaca49727d8df17b7f793b774a8d51a401ef8a8d1
parent 88f2d444
......@@ -246,6 +246,7 @@ if (CONFIG_HIGHBITDEPTH)
set(AOM_DSP_COMMON_INTRIN_AVX2
${AOM_DSP_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
else ()
set(AOM_DSP_COMMON_INTRIN_DSPR2
......
......@@ -85,6 +85,7 @@ DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.c
DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_ssse3.c
DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_avx2.c
endif # CONFIG_HIGHBITDEPTH
DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
......
......@@ -255,6 +255,17 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_highbd_d153_predictor_8x8 ssse3/;
specialize qw/aom_highbd_d153_predictor_16x16 ssse3/;
specialize qw/aom_highbd_d153_predictor_32x32 ssse3/;
specialize qw/aom_highbd_d45e_predictor_4x4 sse2/;
specialize qw/aom_highbd_d45e_predictor_4x8 sse2/;
specialize qw/aom_highbd_d45e_predictor_8x4 sse2/;
specialize qw/aom_highbd_d45e_predictor_8x8 sse2/;
specialize qw/aom_highbd_d45e_predictor_8x16 sse2/;
specialize qw/aom_highbd_d45e_predictor_16x8 avx2/;
specialize qw/aom_highbd_d45e_predictor_16x16 avx2/;
specialize qw/aom_highbd_d45e_predictor_16x32 avx2/;
specialize qw/aom_highbd_d45e_predictor_32x16 avx2/;
specialize qw/aom_highbd_d45e_predictor_32x32 avx2/;
} # CONFIG_HIGHBITDEPTH
#
......
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <immintrin.h>
#include "./aom_dsp_rtcd.h"
// -----------------------------------------------------------------------------
// D45E_PRED
/*
; ------------------------------------------
; input: x, y, z, result
;
; trick from pascal
; (x+2y+z+2)>>2 can be calculated as:
; result = avg(x,z)
; result -= xor(x,z) & 1
; result = avg(result,y)
; ------------------------------------------
*/
static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y,
const __m256i *z) {
const __m256i one = _mm256_set1_epi16(1);
const __m256i a = _mm256_avg_epu16(*x, *z);
const __m256i b =
_mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one));
return _mm256_avg_epu16(b, *y);
}
static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1,
const __m256i *a2, uint16_t **dst,
ptrdiff_t stride) {
const __m256i y = avg3_epu16(a0, a1, a2);
_mm256_storeu_si256((__m256i *)*dst, y);
*dst += stride;
}
void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
(void)left;
(void)bd;
__m256i x0 = _mm256_loadu_si256((const __m256i *)above);
__m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
__m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
d45e_w16(&x0, &x1, &x2, &dst, stride);
int i = 3;
do {
x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
d45e_w16(&x1, &x2, &x0, &dst, stride);
x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
d45e_w16(&x2, &x0, &x1, &dst, stride);
x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
d45e_w16(&x0, &x1, &x2, &dst, stride);
} while (i < 9);
x0 = _mm256_loadu_si256((const __m256i *)(above + 9));
x0 = _mm256_insert_epi16(x0, above[23], 15);
const __m256i y = avg3_epu16(&x1, &x2, &x0);
_mm256_storeu_si256((__m256i *)dst, y);
}
void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
(void)left;
(void)bd;
__m256i x0 = _mm256_loadu_si256((const __m256i *)above);
__m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
__m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
d45e_w16(&x0, &x1, &x2, &dst, stride);
int i = 3;
do {
x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
d45e_w16(&x1, &x2, &x0, &dst, stride);
x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
d45e_w16(&x2, &x0, &x1, &dst, stride);
x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
d45e_w16(&x0, &x1, &x2, &dst, stride);
} while (i < 15);
x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
d45e_w16(&x1, &x2, &x0, &dst, stride);
x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
d45e_w16(&x2, &x0, &x1, &dst, stride);
x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
x2 = _mm256_insert_epi16(x2, above[31], 15);
const __m256i y = avg3_epu16(&x0, &x1, &x2);
_mm256_storeu_si256((__m256i *)dst, y);
}
void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
(void)left;
(void)bd;
__m256i x0 = _mm256_loadu_si256((const __m256i *)above);
__m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
__m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
d45e_w16(&x0, &x1, &x2, &dst, stride);
int i = 3;
do {
x0 = _mm256_loadu_si256((const __m256i *)(above + i++));
d45e_w16(&x1, &x2, &x0, &dst, stride);
x1 = _mm256_loadu_si256((const __m256i *)(above + i++));
d45e_w16(&x2, &x0, &x1, &dst, stride);
x2 = _mm256_loadu_si256((const __m256i *)(above + i++));
d45e_w16(&x0, &x1, &x2, &dst, stride);
} while (i < 33);
x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
x0 = _mm256_insert_epi16(x0, above[47], 15);
const __m256i y = avg3_epu16(&x1, &x2, &x0);
_mm256_storeu_si256((__m256i *)dst, y);
}
void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
(void)left;
(void)bd;
__m256i x0 = _mm256_loadu_si256((const __m256i *)above);
__m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
__m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
__m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
__m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
__m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
uint16_t *dst1 = dst;
uint16_t *dst2 = dst + 16;
d45e_w16(&x0, &x1, &x2, &dst1, stride);
d45e_w16(&y0, &y1, &y2, &dst2, stride);
int i = 3;
do {
x0 = _mm256_loadu_si256((const __m256i *)(above + i));
d45e_w16(&x1, &x2, &x0, &dst1, stride);
y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
d45e_w16(&y1, &y2, &y0, &dst2, stride);
x1 = _mm256_loadu_si256((const __m256i *)(above + i));
d45e_w16(&x2, &x0, &x1, &dst1, stride);
y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
d45e_w16(&y2, &y0, &y1, &dst2, stride);
x2 = _mm256_loadu_si256((const __m256i *)(above + i));
d45e_w16(&x0, &x1, &x2, &dst1, stride);
y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
d45e_w16(&y0, &y1, &y2, &dst2, stride);
} while (i < 15);
x0 = _mm256_loadu_si256((const __m256i *)(above + 15));
d45e_w16(&x1, &x2, &x0, &dst1, stride);
y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15));
d45e_w16(&y1, &y2, &y0, &dst2, stride);
x1 = _mm256_loadu_si256((const __m256i *)(above + 16));
d45e_w16(&x2, &x0, &x1, &dst1, stride);
y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16));
d45e_w16(&y2, &y0, &y1, &dst2, stride);
x2 = _mm256_loadu_si256((const __m256i *)(above + 17));
__m256i u = avg3_epu16(&x0, &x1, &x2);
_mm256_storeu_si256((__m256i *)dst1, u);
y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17));
y2 = _mm256_insert_epi16(y2, above[47], 15);
u = avg3_epu16(&y0, &y1, &y2);
_mm256_storeu_si256((__m256i *)dst2, u);
}
void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
(void)left;
(void)bd;
__m256i x0 = _mm256_loadu_si256((const __m256i *)above);
__m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1));
__m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2));
__m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16));
__m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17));
__m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18));
uint16_t *dst1 = dst;
uint16_t *dst2 = dst + 16;
d45e_w16(&x0, &x1, &x2, &dst1, stride);
d45e_w16(&y0, &y1, &y2, &dst2, stride);
int i = 3;
do {
x0 = _mm256_loadu_si256((const __m256i *)(above + i));
d45e_w16(&x1, &x2, &x0, &dst1, stride);
y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
d45e_w16(&y1, &y2, &y0, &dst2, stride);
x1 = _mm256_loadu_si256((const __m256i *)(above + i));
d45e_w16(&x2, &x0, &x1, &dst1, stride);
y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
d45e_w16(&y2, &y0, &y1, &dst2, stride);
x2 = _mm256_loadu_si256((const __m256i *)(above + i));
d45e_w16(&x0, &x1, &x2, &dst1, stride);
y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++));
d45e_w16(&y0, &y1, &y2, &dst2, stride);
} while (i < 33);
x0 = _mm256_loadu_si256((const __m256i *)(above + 33));
__m256i u = avg3_epu16(&x1, &x2, &x0);
_mm256_storeu_si256((__m256i *)dst1, u);
y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33));
y0 = _mm256_insert_epi16(y0, above[63], 15);
u = avg3_epu16(&y1, &y2, &y0);
_mm256_storeu_si256((__m256i *)dst2, u);
}
......@@ -1094,3 +1094,163 @@ void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
dst += stride;
_mm_storel_epi64((__m128i *)dst, row3);
}
void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
__m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6);
const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
(void)left;
(void)bd;
_mm_storel_epi64((__m128i *)dst, avg3);
dst += stride;
_mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
dst += stride;
_mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
dst += stride;
_mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
}
void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
(void)left;
(void)bd;
__m128i h76543210 = _mm_load_si128((const __m128i *)above);
__m128i hx7654321 = _mm_srli_si128(h76543210, 2);
__m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7);
__m128i hx8765432 = _mm_srli_si128(h87654321, 2);
__m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7);
__m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432);
_mm_storel_epi64((__m128i *)dst, avg3);
dst += stride;
_mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
dst += stride;
_mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
dst += stride;
_mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
dst += stride;
_mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8));
dst += stride;
// hcba98765
h76543210 = _mm_loadu_si128((const __m128i *)((above + 5)));
h76543210 = _mm_insert_epi16(h76543210, above[11], 7);
// hxcba9876
hx7654321 = _mm_srli_si128(h76543210, 2);
// hxxcba987
hx8765432 = _mm_srli_si128(h76543210, 4);
avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432);
_mm_storel_epi64((__m128i *)dst, avg3);
dst += stride;
_mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
dst += stride;
_mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
}
void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
(void)left;
(void)bd;
__m128i x0 = _mm_load_si128((const __m128i *)above);
__m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
__m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
__m128i y = avg3_epu16(&x0, &x1, &x2);
_mm_store_si128((__m128i *)dst, y);
dst += stride;
x0 = _mm_loadu_si128((const __m128i *)(above + 3));
y = avg3_epu16(&x1, &x2, &x0);
_mm_store_si128((__m128i *)dst, y);
dst += stride;
x1 = _mm_loadu_si128((const __m128i *)(above + 4));
y = avg3_epu16(&x2, &x0, &x1);
_mm_store_si128((__m128i *)dst, y);
dst += stride;
x2 = _mm_loadu_si128((const __m128i *)(above + 5));
x2 = _mm_insert_epi16(x2, above[11], 7);
y = avg3_epu16(&x0, &x1, &x2);
_mm_store_si128((__m128i *)dst, y);
}
static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1,
const __m128i *a2, uint16_t **dst,
ptrdiff_t stride) {
const __m128i y = avg3_epu16(a0, a1, a2);
_mm_storeu_si128((__m128i *)*dst, y);
*dst += stride;
}
void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
(void)left;
(void)bd;
__m128i x0 = _mm_load_si128((const __m128i *)above);
__m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
__m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
d45e_w8(&x0, &x1, &x2, &dst, stride);
int i = 3;
do {
x0 = _mm_loadu_si128((const __m128i *)(above + i++));
d45e_w8(&x1, &x2, &x0, &dst, stride);
x1 = _mm_loadu_si128((const __m128i *)(above + i++));
d45e_w8(&x2, &x0, &x1, &dst, stride);
x2 = _mm_loadu_si128((const __m128i *)(above + i++));
d45e_w8(&x0, &x1, &x2, &dst, stride);
} while (i < 9);
x0 = _mm_loadu_si128((const __m128i *)(above + 9));
x0 = _mm_insert_epi16(x0, above[15], 7);
const __m128i y = avg3_epu16(&x1, &x2, &x0);
_mm_store_si128((__m128i *)dst, y);
}
void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {
(void)left;
(void)bd;
__m128i x0 = _mm_load_si128((const __m128i *)above);
__m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1));
__m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2));
d45e_w8(&x0, &x1, &x2, &dst, stride);
int i = 3;
do {
x0 = _mm_loadu_si128((const __m128i *)(above + i++));
d45e_w8(&x1, &x2, &x0, &dst, stride);
x1 = _mm_loadu_si128((const __m128i *)(above + i++));
d45e_w8(&x2, &x0, &x1, &dst, stride);
x2 = _mm_loadu_si128((const __m128i *)(above + i++));
d45e_w8(&x0, &x1, &x2, &dst, stride);
} while (i < 15);
x0 = _mm_loadu_si128((const __m128i *)(above + 15));
__m128i y = avg3_epu16(&x1, &x2, &x0);
_mm_store_si128((__m128i *)dst, y);
dst += stride;
x1 = _mm_loadu_si128((const __m128i *)(above + 16));
y = avg3_epu16(&x2, &x0, &x1);
_mm_store_si128((__m128i *)dst, y);
dst += stride;
x2 = _mm_loadu_si128((const __m128i *)(above + 17));
x2 = _mm_insert_epi16(x2, above[23], 7);
y = avg3_epu16(&x0, &x1, &x2);
_mm_store_si128((__m128i *)dst, y);
}
......@@ -139,7 +139,7 @@ class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
}
};
TEST_P(HighbdIntraPredTest, IntraPredTests) {
TEST_P(HighbdIntraPredTest, Bitexact) {
// max block size is 32
DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]);
DECLARE_ALIGNED(16, uint16_t, above_data[2 * 32 + 32]);
......@@ -148,7 +148,7 @@ TEST_P(HighbdIntraPredTest, IntraPredTests) {
RunTest(left_col, above_data, dst, ref_dst);
}
TEST_P(LowbdIntraPredTest, IntraPredTests) {
TEST_P(LowbdIntraPredTest, Bitexact) {
// max block size is 32
DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
......@@ -157,6 +157,9 @@ TEST_P(LowbdIntraPredTest, IntraPredTests) {
RunTest(left_col, above_data, dst, ref_dst);
}
// -----------------------------------------------------------------------------
// High Bit Depth Tests
#define highbd_entry(type, width, height, opt, bd) \
IntraPredFunc<HighbdIntraPred>( \
&aom_highbd_##type##_predictor_##width##x##height##_##opt, \
......@@ -178,7 +181,9 @@ const IntraPredFunc<HighbdIntraPred> IntraPredTestVector8[] = {
highbd_intrapred(dc_top, sse2, 8), highbd_intrapred(dc_128, sse2, 8),
highbd_intrapred(h, sse2, 8), highbd_intrapred(v, sse2, 8),
highbd_entry(d117, 4, 4, sse2, 8), highbd_entry(d135, 4, 4, sse2, 8),
highbd_entry(d153, 4, 4, sse2, 8),
highbd_entry(d153, 4, 4, sse2, 8), highbd_entry(d45e, 4, 4, sse2, 8),
highbd_entry(d45e, 4, 8, sse2, 8), highbd_entry(d45e, 8, 4, sse2, 8),
highbd_entry(d45e, 8, 8, sse2, 8), highbd_entry(d45e, 8, 16, sse2, 8),
};
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, HighbdIntraPredTest,
......@@ -189,7 +194,9 @@ const IntraPredFunc<HighbdIntraPred> IntraPredTestVector10[] = {
highbd_intrapred(dc_top, sse2, 10), highbd_intrapred(dc_128, sse2, 10),
highbd_intrapred(h, sse2, 10), highbd_intrapred(v, sse2, 10),
highbd_entry(d117, 4, 4, sse2, 10), highbd_entry(d135, 4, 4, sse2, 10),
highbd_entry(d153, 4, 4, sse2, 10),
highbd_entry(d153, 4, 4, sse2, 10), highbd_entry(d45e, 4, 4, sse2, 10),
highbd_entry(d45e, 4, 8, sse2, 10), highbd_entry(d45e, 8, 4, sse2, 10),
highbd_entry(d45e, 8, 8, sse2, 10), highbd_entry(d45e, 8, 16, sse2, 10),
};
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, HighbdIntraPredTest,
......@@ -200,7 +207,9 @@ const IntraPredFunc<HighbdIntraPred> IntraPredTestVector12[] = {
highbd_intrapred(dc_top, sse2, 12), highbd_intrapred(dc_128, sse2, 12),
highbd_intrapred(h, sse2, 12), highbd_intrapred(v, sse2, 12),
highbd_entry(d117, 4, 4, sse2, 12), highbd_entry(d135, 4, 4, sse2, 12),
highbd_entry(d153, 4, 4, sse2, 12),
highbd_entry(d153, 4, 4, sse2, 12), highbd_entry(d45e, 4, 4, sse2, 12),
highbd_entry(d45e, 4, 8, sse2, 12), highbd_entry(d45e, 8, 4, sse2, 12),
highbd_entry(d45e, 8, 8, sse2, 12), highbd_entry(d45e, 8, 16, sse2, 12),
};
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, HighbdIntraPredTest,
......@@ -239,8 +248,37 @@ const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorSsse3_12[] = {
INSTANTIATE_TEST_CASE_P(SSSE3_TO_C_12, HighbdIntraPredTest,
::testing::ValuesIn(IntraPredTestVectorSsse3_12));
#endif // HAVE_SSSE3
#if HAVE_AVX2
const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorAvx2_8[] = {
highbd_entry(d45e, 16, 8, avx2, 8), highbd_entry(d45e, 16, 16, avx2, 8),
highbd_entry(d45e, 16, 32, avx2, 8), highbd_entry(d45e, 32, 16, avx2, 8),
highbd_entry(d45e, 32, 32, avx2, 8),
};
INSTANTIATE_TEST_CASE_P(AVX2_TO_C_8, HighbdIntraPredTest,
::testing::ValuesIn(IntraPredTestVectorAvx2_8));
const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorAvx2_10[] = {
highbd_entry(d45e, 16, 8, avx2, 10), highbd_entry(d45e, 16, 16, avx2, 10),
highbd_entry(d45e, 16, 32, avx2, 10), highbd_entry(d45e, 32, 16, avx2, 10),
highbd_entry(d45e, 32, 32, avx2, 10),
};
INSTANTIATE_TEST_CASE_P(AVX2_TO_C_10, HighbdIntraPredTest,
::testing::ValuesIn(IntraPredTestVectorAvx2_10));
const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorAvx2_12[] = {
highbd_entry(d45e, 16, 8, avx2, 12), highbd_entry(d45e, 16, 16, avx2, 12),
highbd_entry(d45e, 16, 32, avx2, 12), highbd_entry(d45e, 32, 16, avx2, 12),
highbd_entry(d45e, 32, 32, avx2, 12),
};
INSTANTIATE_TEST_CASE_P(AVX2_TO_C_12, HighbdIntraPredTest,
::testing::ValuesIn(IntraPredTestVectorAvx2_12));
#endif // HAVE_AVX2
#endif // CONFIG_HIGHBITDEPTH
// -----------------------------------------------------------------------------
// Low Bit Depth Tests
#define lowbd_entry(type, width, height, opt) \
IntraPredFunc<IntraPred>(&aom_##type##_predictor_##width##x##height##_##opt, \
&aom_##type##_predictor_##width##x##height##_c, \
......
......@@ -1145,16 +1145,19 @@ HIGHBD_INTRA_PRED_TEST(
SSE2_1, TestHighbdIntraPred4, "Hbd Intra4x4",
aom_highbd_dc_predictor_4x4_sse2, aom_highbd_dc_left_predictor_4x4_sse2,
aom_highbd_dc_top_predictor_4x4_sse2, aom_highbd_dc_128_predictor_4x4_sse2,
aom_highbd_v_predictor_4x4_sse2, aom_highbd_h_predictor_4x4_sse2, NULL,
aom_highbd_d135_predictor_4x4_sse2, aom_highbd_d117_predictor_4x4_sse2,
aom_highbd_d153_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL)
aom_highbd_v_predictor_4x4_sse2, aom_highbd_h_predictor_4x4_sse2,
aom_highbd_d45e_predictor_4x4_sse2, aom_highbd_d135_predictor_4x4_sse2,
aom_highbd_d117_predictor_4x4_sse2, aom_highbd_d153_predictor_4x4_sse2,
NULL, NULL, NULL, NULL, NULL, NULL)
HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred4, "Hbd Intra4x8",
aom_highbd_dc_predictor_4x8_sse2,
aom_highbd_dc_left_predictor_4x8_sse2,
aom_highbd_dc_top_predictor_4x8_sse2,
aom_highbd_dc_128_predictor_4x8_sse2,
aom_highbd_v_predictor_4x8_sse2,
aom_highbd_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL,
aom_highbd_h_predictor_4x8_sse2,
aom_highbd_d45e_predictor_4x8_sse2, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL)
#endif
......@@ -1208,7 +1211,8 @@ HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred8, "Hbd Intra8x8",
aom_highbd_dc_top_predictor_8x8_sse2,
aom_highbd_dc_128_predictor_8x8_sse2,
aom_highbd_v_predictor_8x8_sse2,
aom_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
aom_highbd_h_predictor_8x8_sse2,
aom_highbd_d45e_predictor_8x8_sse2, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL)
HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred8, "Hbd Intra8x4",
aom_highbd_dc_predictor_8x4_sse2,
......@@ -1216,7 +1220,8 @@ HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred8, "Hbd Intra8x4",
aom_highbd_dc_top_predictor_8x4_sse2,
aom_highbd_dc_128_predictor_8x4_sse2,
aom_highbd_v_predictor_8x4_sse2,
aom_highbd_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL,
aom_highbd_h_predictor_8x4_sse2,
aom_highbd_d45e_predictor_8x4_sse2, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL)
HIGHBD_INTRA_PRED_TEST(SSE2_3, TestHighbdIntraPred8,