diff --git a/test/reconintra_predictors_test.cc b/test/reconintra_predictors_test.cc
index 38720baa856b2c36007bdee571636e27ede5254a..5da9af57478622b96edb697fb7023d4ca72fa970 100644
--- a/test/reconintra_predictors_test.cc
+++ b/test/reconintra_predictors_test.cc
@@ -32,6 +32,20 @@ typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, int bs,
 typedef tuple<Predictor, Predictor, int> PredFuncMode;
 typedef tuple<PredFuncMode, int> PredParams;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*HbdPredictor)(uint16_t *dst, ptrdiff_t stride, int bs,
+                             const uint16_t *above, const uint16_t *left,
+                             int bd);
+
+// Note:
+// Test parameter list:
+// Reference predictor, optimized predictor, prediction mode, block size,
+// bit depth
+//
+typedef tuple<HbdPredictor, HbdPredictor, int> HbdPredFuncMode;
+typedef tuple<HbdPredFuncMode, int, int> HbdPredParams;
+#endif
+
 const int MaxBlkSize = 32;
 
 // By default, disable speed test
@@ -136,6 +150,105 @@ class VP10IntraPredOptimzTest : public ::testing::TestWithParam<PredParams> {
   uint8_t *predRef_;
 };
 
+#if CONFIG_VP9_HIGHBITDEPTH
+class VP10HbdIntraPredOptimzTest :  // reference vs. optimized 16-bit predictors
+    public ::testing::TestWithParam<HbdPredParams> {
+ public:
+  virtual ~VP10HbdIntraPredOptimzTest() {}
+  virtual void SetUp() {
+    HbdPredFuncMode funcMode = GET_PARAM(0);
+    predFuncRef_ = std::tr1::get<0>(funcMode);
+    predFunc_ = std::tr1::get<1>(funcMode);
+    mode_ = std::tr1::get<2>(funcMode);
+    blockSize_ = GET_PARAM(1);
+    bd_ = GET_PARAM(2);
+
+    alloc_ = (uint16_t *)malloc((3 * MaxBlkSize + 2) * sizeof(alloc_[0]));
+    predRef_ =
+        (uint16_t *)malloc(MaxBlkSize * MaxBlkSize * sizeof(predRef_[0]));
+    pred_ = (uint16_t *)malloc(MaxBlkSize * MaxBlkSize * sizeof(pred_[0]));
+  }
+
+  virtual void TearDown() {
+    free(alloc_);  // allocated with malloc() in SetUp(), so release with free()
+    free(predRef_);
+    free(pred_);
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunTest() const {
+    int tstIndex = 0;
+    int stride = blockSize_;
+    uint16_t *left = alloc_;  // left column occupies the start of alloc_
+    uint16_t *above = alloc_ + MaxBlkSize + 1;  // predictors receive &above[1]
+    while (tstIndex < MaxTestNum) {
+      PrepareBuffer();
+      predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
+      ASM_REGISTER_STATE_CHECK(
+          predFunc_(pred_, stride, blockSize_, &above[1], left, bd_));
+      DiffPred(tstIndex);
+      tstIndex += 1;
+    }
+  }
+
+  void RunSpeedTestC() const {
+    int tstIndex = 0;
+    int stride = blockSize_;
+    uint16_t *left = alloc_;
+    uint16_t *above = alloc_ + MaxBlkSize + 1;
+    PrepareBuffer();
+    while (tstIndex < MaxTestNum) {
+      predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
+      tstIndex += 1;
+    }
+  }
+
+  void RunSpeedTestSSE() const {
+    int tstIndex = 0;
+    int stride = blockSize_;
+    uint16_t *left = alloc_;
+    uint16_t *above = alloc_ + MaxBlkSize + 1;
+    PrepareBuffer();
+    while (tstIndex < MaxTestNum) {
+      predFunc_(predRef_, stride, blockSize_, &above[1], left, bd_);
+      tstIndex += 1;
+    }
+  }
+
+ private:
+  void PrepareBuffer() const {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());  // NOTE(review): re-seeded on every call; presumably each iteration sees the same data - confirm
+    int i = 0;
+    while (i < (3 * MaxBlkSize + 2)) {
+      alloc_[i] = rnd.Rand16() & ((1 << bd_) - 1);  // keep samples within bd_ bits
+      i += 1;
+    }
+  }
+
+  void DiffPred(int testNum) const {
+    int i = 0;
+    while (i < blockSize_ * blockSize_) {
+      EXPECT_EQ(predRef_[i], pred_[i])
+          << "Error at position: " << i << " "
+          << "Block size: " << blockSize_ << " "
+          << "Bit depth: " << bd_ << " "
+          << "Test number: " << testNum;
+      i += 1;
+    }
+  }
+
+  HbdPredictor predFunc_;
+  HbdPredictor predFuncRef_;
+  int mode_;
+  int blockSize_;
+  int bd_;
+  uint16_t *alloc_;
+  uint16_t *pred_;
+  uint16_t *predRef_;
+};
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
 TEST_P(VP10IntraPredOptimzTest, BitExactCheck) {
   RunTest();
 }
@@ -150,6 +263,22 @@ TEST_P(VP10IntraPredOptimzTest, SpeedCheckSSE) {
 }
 #endif
 
+#if CONFIG_VP9_HIGHBITDEPTH
+TEST_P(VP10HbdIntraPredOptimzTest, BitExactCheck) {
+  RunTest();
+}
+
+#if PREDICTORS_SPEED_TEST
+TEST_P(VP10HbdIntraPredOptimzTest, SpeedCheckC) {
+  RunSpeedTestC();
+}
+
+TEST_P(VP10HbdIntraPredOptimzTest, SpeedCheckSSE) {
+  RunSpeedTestSSE();
+}
+#endif // PREDICTORS_SPEED_TEST
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
 using std::tr1::make_tuple;
 
 const PredFuncMode kPredFuncMdArray[] = {
@@ -183,4 +312,38 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::ValuesIn(kPredFuncMdArray),
         ::testing::ValuesIn(kBlkSize)));
 
+#if CONFIG_VP9_HIGHBITDEPTH
+const HbdPredFuncMode kHbdPredFuncMdArray[] = {  // {C reference, SSE4.1 version, prediction mode}
+  make_tuple(vp10_highbd_dc_filter_predictor_c,
+             vp10_highbd_dc_filter_predictor_sse4_1, DC_PRED),
+  make_tuple(vp10_highbd_v_filter_predictor_c,
+             vp10_highbd_v_filter_predictor_sse4_1, V_PRED),
+  make_tuple(vp10_highbd_h_filter_predictor_c,
+             vp10_highbd_h_filter_predictor_sse4_1, H_PRED),
+  make_tuple(vp10_highbd_d45_filter_predictor_c,
+             vp10_highbd_d45_filter_predictor_sse4_1, D45_PRED),
+  make_tuple(vp10_highbd_d135_filter_predictor_c,
+             vp10_highbd_d135_filter_predictor_sse4_1, D135_PRED),
+  make_tuple(vp10_highbd_d117_filter_predictor_c,
+             vp10_highbd_d117_filter_predictor_sse4_1, D117_PRED),
+  make_tuple(vp10_highbd_d153_filter_predictor_c,
+             vp10_highbd_d153_filter_predictor_sse4_1, D153_PRED),
+  make_tuple(vp10_highbd_d207_filter_predictor_c,
+             vp10_highbd_d207_filter_predictor_sse4_1, D207_PRED),
+  make_tuple(vp10_highbd_d63_filter_predictor_c,
+             vp10_highbd_d63_filter_predictor_sse4_1, D63_PRED),
+  make_tuple(vp10_highbd_tm_filter_predictor_c,
+             vp10_highbd_tm_filter_predictor_sse4_1, TM_PRED),
+};
+
+const int kBd[] = {10, 12};  // bit depths the high-bitdepth test is instantiated with
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HbdIntraPredOptimzTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(kHbdPredFuncMdArray),
+        ::testing::ValuesIn(kBlkSize),
+        ::testing::ValuesIn(kBd)));  // every (predictor pair, block size, bit depth) combination
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
 } // namespace
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index 19d0c3d48889b60df8ef74a0baa77d5f973fa26d..b5b0777e56fd251e286f790cc8c95dc1d0959561 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -1071,85 +1071,115 @@ static void highbd_filter_intra_predictors_4tap(uint16_t *dst, ptrdiff_t stride,
   }
 }
 
-static void highbd_dc_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_dc_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                        int bs, const uint16_t *above,
                                        const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED,
                                       bd);
 }
 
-static void highbd_v_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_v_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                       int bs, const uint16_t *above,
                                       const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED,
                                       bd);
 }
 
-static void highbd_h_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_h_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                       int bs, const uint16_t *above,
                                       const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED,
                                       bd);
 }
 
-static void highbd_d45_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d45_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                         int bs, const uint16_t *above,
                                         const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED,
                                       bd);
 }
 
-static void highbd_d135_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d135_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED,
                                       bd);
 }
 
-static void highbd_d117_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d117_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED,
                                       bd);
 }
 
-static void highbd_d153_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d153_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED,
                                       bd);
 }
 
-static void highbd_d207_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d207_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED,
                                       bd);
 }
 
-static void highbd_d63_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d63_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                         int bs, const uint16_t *above,
                                         const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED,
                                       bd);
 }
 
-static void highbd_tm_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_tm_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                        int bs, const uint16_t *above,
                                        const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED,
                                       bd);
 }
 
-static void (*highbd_filter_intra_predictors[EXT_INTRA_MODES])(uint16_t *dst,
-    ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left,
-    int bd) = {
-  highbd_dc_filter_predictor, highbd_v_filter_predictor,
-  highbd_h_filter_predictor, highbd_d45_filter_predictor,
-  highbd_d135_filter_predictor, highbd_d117_filter_predictor,
-  highbd_d153_filter_predictor, highbd_d207_filter_predictor,
-  highbd_d63_filter_predictor, highbd_tm_filter_predictor,
-};
+static void highbd_filter_intra_predictors(int mode, uint16_t *dst,
+                                           ptrdiff_t stride, int bs,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  switch (mode) {  // forward to the per-mode predictor declared in vp10_rtcd_defs.pl
+    case DC_PRED:
+      vp10_highbd_dc_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case V_PRED:
+      vp10_highbd_v_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case H_PRED:
+      vp10_highbd_h_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D45_PRED:
+      vp10_highbd_d45_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D135_PRED:
+      vp10_highbd_d135_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D117_PRED:
+      vp10_highbd_d117_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D153_PRED:
+      vp10_highbd_d153_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D207_PRED:
+      vp10_highbd_d207_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D63_PRED:
+      vp10_highbd_d63_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case TM_PRED:
+      vp10_highbd_tm_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    default:
+      assert(0);  // mode is not one of the ten handled prediction modes
+  }
+}
 #endif // CONFIG_VP9_HIGHBITDEPTH
 #endif // CONFIG_EXT_INTRA
 
@@ -1303,7 +1333,7 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
 
 #if CONFIG_EXT_INTRA
   if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
-    highbd_filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
+    highbd_filter_intra_predictors(ext_intra_mode, dst, dst_stride, bs,
         const_above_row, left_col, xd->bd);
     return;
   }
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 6dbcc65c9889e9919ae456bd2b197887040d427c..b82b2634996ea0745e56e9defa519368743b7aab 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -320,6 +320,29 @@ if (vpx_config("CONFIG_EXT_INTRA") eq "yes") {
   specialize qw/vp10_d63_filter_predictor sse4_1/;
   add_proto qw/void vp10_tm_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
   specialize qw/vp10_tm_filter_predictor sse4_1/;
+  # High bitdepth functions
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vp10_highbd_dc_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_dc_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_v_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_v_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_h_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_h_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d45_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d45_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d135_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d135_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d117_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d117_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d153_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d153_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d207_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d207_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d63_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d63_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_tm_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_tm_filter_predictor sse4_1/;
+  }
 }
 
 # High bitdepth functions
diff --git a/vp10/common/x86/reconintra_sse4.c b/vp10/common/x86/reconintra_sse4.c
index 851d850e732c0a6bbd6a1878435e8f6b5d7ce80a..7399de2b0f95dfa8270364050d03948b1f5c202f 100644
--- a/vp10/common/x86/reconintra_sse4.c
+++ b/vp10/common/x86/reconintra_sse4.c
@@ -591,3 +591,323 @@ void vp10_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
   GetIntraFilterParams(bs, TM_PRED, &prm[0]);
   FilterPrediction(above, left, bs, prm, dst, stride);
 }
+
+// ============== High Bit Depth ==============
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int HighbdGetMeanValue4x4(const uint16_t *above,
+                                        const uint16_t *left, const int bd,
+                                        __m128i *params) {
+  const __m128i a = _mm_loadu_si128((const __m128i *)above);
+  const __m128i l = _mm_loadu_si128((const __m128i *)left);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum_vector, u;
+  uint16_t sum_value;
+  (void)bd;
+
+  sum_vector = _mm_add_epi16(a, l);
+
+  sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+  u = _mm_srli_si128(sum_vector, 2);
+  sum_vector = _mm_add_epi16(sum_vector, u);
+
+  sum_value = _mm_extract_epi16(sum_vector, 0);  // lane 0: first 4 above + first 4 left
+  sum_value += 4;  // rounding offset: half of the 8 samples
+  sum_value >>= 3;  // divide by 8
+  *params = _mm_set1_epi32(sum_value);
+  return sum_value;
+}
+
+static INLINE int HighbdGetMeanValue8x8(const uint16_t *above,
+                                        const uint16_t *left, const int bd,
+                                        __m128i *params) {
+  const __m128i a = _mm_loadu_si128((const __m128i *)above);
+  const __m128i l = _mm_loadu_si128((const __m128i *)left);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum_vector, u;
+  uint16_t sum_value;
+  (void)bd;
+
+  sum_vector = _mm_add_epi16(a, l);
+
+  sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+  sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+  u = _mm_srli_si128(sum_vector, 2);
+  sum_vector = _mm_add_epi16(sum_vector, u);
+
+  sum_value = _mm_extract_epi16(sum_vector, 0);
+  sum_value += 8;  // rounding offset: half of the 16 samples
+  sum_value >>= 4;  // divide by 16
+  *params = _mm_set1_epi32(sum_value);
+  return sum_value;
+}
+
+// Note:
+// Process 16 pixels above and left, 10-bit depth
+// Add to the last 8 pixels sum
+static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left,
+                                  __m128i *sum) {
+  __m128i a = _mm_loadu_si128((const __m128i *)above);
+  __m128i l = _mm_loadu_si128((const __m128i *)left);
+  sum[0] = _mm_add_epi16(a, l);
+  a = _mm_loadu_si128((const __m128i *)(above + 8));
+  l = _mm_loadu_si128((const __m128i *)(left + 8));
+  sum[0] = _mm_add_epi16(sum[0], a);
+  sum[0] = _mm_add_epi16(sum[0], l);
+}
+
+// Note:
+// Process 16 pixels above and left, 12-bit depth
+// Add to the last 8 pixels sum
+static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left,
+                                  __m128i *sum) {
+  __m128i a = _mm_loadu_si128((const __m128i *)above);
+  __m128i l = _mm_loadu_si128((const __m128i *)left);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i v0, v1;
+
+  v0 = _mm_unpacklo_epi16(a, zero);  // zero-extend the low four samples to 32 bits
+  v1 = _mm_unpacklo_epi16(l, zero);
+  sum[0] = _mm_add_epi32(v0, v1);
+
+  v0 = _mm_unpackhi_epi16(a, zero);
+  v1 = _mm_unpackhi_epi16(l, zero);
+  sum[0] = _mm_add_epi32(sum[0], v0);
+  sum[0] = _mm_add_epi32(sum[0], v1);
+
+  a = _mm_loadu_si128((const __m128i *)(above + 8));
+  l = _mm_loadu_si128((const __m128i *)(left + 8));
+
+  v0 = _mm_unpacklo_epi16(a, zero);
+  v1 = _mm_unpacklo_epi16(l, zero);
+  sum[0] = _mm_add_epi32(sum[0], v0);
+  sum[0] = _mm_add_epi32(sum[0], v1);
+
+  v0 = _mm_unpackhi_epi16(a, zero);
+  v1 = _mm_unpackhi_epi16(l, zero);
+  sum[0] = _mm_add_epi32(sum[0], v0);
+  sum[0] = _mm_add_epi32(sum[0], v1);
+}
+
+static INLINE int HighbdGetMeanValue16x16(const uint16_t *above,
+                                          const uint16_t *left, const int bd,
+                                          __m128i *params) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum_vector, u;
+  uint32_t sum_value = 0;
+
+  if (bd <= 10) {  // 32 samples of at most 10 bits: the total fits in 16-bit lanes
+    AddPixels10bit(above, left, &sum_vector);
+    sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+    sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+    u = _mm_srli_si128(sum_vector, 2);
+    sum_vector = _mm_add_epi16(sum_vector, u);
+    sum_value = _mm_extract_epi16(sum_vector, 0);
+  } else {  // wider samples (12-bit): accumulate in 32-bit lanes
+    AddPixels12bit(above, left, &sum_vector);
+
+    sum_vector = _mm_hadd_epi32(sum_vector, zero);
+    u = _mm_srli_si128(sum_vector, 4);
+    sum_vector = _mm_add_epi32(u, sum_vector);
+    sum_value = _mm_extract_epi32(sum_vector, 0);
+  }
+
+  sum_value += 16;  // rounding offset: half of the 32 samples
+  sum_value >>= 5;  // divide by 32
+  *params = _mm_set1_epi32(sum_value);
+  return sum_value;
+}
+
+static INLINE int HighbdGetMeanValue32x32(const uint16_t *above,
+                                          const uint16_t *left, const int bd,
+                                          __m128i *params) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum_vector[2], u;
+  uint32_t sum_value = 0;
+
+  if (bd <= 10) {  // 64 samples of at most 10 bits: 64 * 1023 still fits in 16 bits
+    AddPixels10bit(above, left, &sum_vector[0]);
+    AddPixels10bit(above + 16, left + 16, &sum_vector[1]);
+
+    sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
+    sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values
+    sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values
+
+    u = _mm_srli_si128(sum_vector[0], 2);
+    sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
+    sum_value = _mm_extract_epi16(sum_vector[0], 0);
+  } else {  // wider samples (12-bit): accumulate in 32-bit lanes
+    AddPixels12bit(above, left, &sum_vector[0]);
+    AddPixels12bit(above + 16, left + 16, &sum_vector[1]);
+
+    sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]);
+    sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero);
+    u = _mm_srli_si128(sum_vector[0], 4);
+    sum_vector[0] = _mm_add_epi32(u, sum_vector[0]);
+    sum_value = _mm_extract_epi32(sum_vector[0], 0);
+  }
+
+  sum_value += 32;  // rounding offset: half of the 64 samples
+  sum_value >>= 6;  // divide by 64
+  *params = _mm_set1_epi32(sum_value);
+  return sum_value;
+}
+
+// Note:
+// params[4] : mean value, 4 int32_t repetition
+//
+static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above,
+                                               const uint16_t *left, int bs,
+                                               const int bd, __m128i *params) {
+  int meanValue = 0;
+  switch (bs) {
+    case 4:
+      meanValue = HighbdGetMeanValue4x4(above, left, bd, params);
+      break;
+    case 8:
+      meanValue = HighbdGetMeanValue8x8(above, left, bd, params);
+      break;
+    case 16:
+      meanValue = HighbdGetMeanValue16x16(above, left, bd, params);
+      break;
+    case 32:
+      meanValue = HighbdGetMeanValue32x32(above, left, bd, params);
+      break;
+    default:
+      assert(0);  // only block sizes 4, 8, 16 and 32 are handled
+  }
+  return meanValue;
+}
+
+// Note:
+// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
+// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
+static void HighbdGeneratePrediction(const uint16_t *above,
+                                     const uint16_t *left,
+                                     const int bs, const int bd,
+                                     const __m128i *prm, int meanValue,
+                                     uint16_t *dst,
+                                     ptrdiff_t stride) {
+  int pred[33][65];  // (bs + 1) rows by (2 * bs + 1) columns for bs up to 32
+  int r, c, colBound;
+  int remainings;
+  int ipred;
+
+  for (r = 0; r < bs; ++r) {
+    pred[r + 1][0] = (int)left[r] - meanValue;
+  }
+
+  above -= 1;  // row 0 starts one sample before the caller's above[0]
+  for (c = 0; c < 2 * bs + 1; ++c) {
+    pred[0][c] = (int)above[c] - meanValue;
+  }
+
+  r = 0;
+  c = 0;
+  while (r < bs) {
+    colBound = (bs << 1) - r;
+    for (c = 0; c < colBound; c += 4) {
+      remainings = colBound - c + 1;
+      ProducePixels(&pred[r][c], prm, remainings);
+    }
+    r += 1;
+  }
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      ipred = pred[r + 1][c + 1] + meanValue;  // add back the mean removed above
+      dst[c] = clip_pixel_highbd(ipred, bd);
+    }
+    dst += stride;
+  }
+}
+
+static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left,
+                                   int bs, const int bd, __m128i *prm,
+                                   uint16_t *dst, ptrdiff_t stride) {
+  int meanValue = 0;
+  meanValue = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, &prm[4]);
+  HighbdGeneratePrediction(above, left, bs, bd, prm, meanValue, dst, stride);
+}
+
+void vp10_highbd_dc_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+                                            int bs, const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  __m128i prm[5];  // prm[4] receives the broadcast mean value
+  GetIntraFilterParams(bs, DC_PRED, &prm[0]);
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_v_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+                                           int bs, const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  __m128i prm[5];
+  GetIntraFilterParams(bs, V_PRED, &prm[0]);
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_h_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+                                           int bs, const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  __m128i prm[5];
+  GetIntraFilterParams(bs, H_PRED, &prm[0]);
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d45_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+                                             int bs, const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  __m128i prm[5];
+  GetIntraFilterParams(bs, D45_PRED, &prm[0]);
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d135_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+                                              int bs, const uint16_t *above,
+                                              const uint16_t *left, int bd) {
+  __m128i prm[5];
+  GetIntraFilterParams(bs, D135_PRED, &prm[0]);
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d117_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+                                              int bs, const uint16_t *above,
+                                              const uint16_t *left, int bd) {
+  __m128i prm[5];
+  GetIntraFilterParams(bs, D117_PRED, &prm[0]);
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d153_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+                                              int bs, const uint16_t *above,
+                                              const uint16_t *left, int bd) {
+  __m128i prm[5];
+  GetIntraFilterParams(bs, D153_PRED, &prm[0]);
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d207_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+                                              int bs, const uint16_t *above,
+                                              const uint16_t *left, int bd) {
+  __m128i prm[5];
+  GetIntraFilterParams(bs, D207_PRED, &prm[0]);
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d63_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+                                             int bs, const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  __m128i prm[5];
+  GetIntraFilterParams(bs, D63_PRED, &prm[0]);
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_tm_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+                                            int bs, const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  __m128i prm[5];
+  GetIntraFilterParams(bs, TM_PRED, &prm[0]);
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH