Commit 8cacca73 authored by Yi Luo
Browse files

HBD convolution filtering (10/12 taps) SSE4.1 optimization

- For experiment EXT_INTERP under high bit depth.
- Add a unit test to verify bit-exactness.
- Speed performance improvement:
  On Xeon E5-2680, park_joy_1080p_12.y4m, 50 frames, encoding time
  drops from 6682503 ms to 5390270 ms.

Change-Id: Iea4debf5414f3accf1eb5672abeab56a0539ac77
parent 1178f71d
...@@ -24,12 +24,25 @@ using libvpx_test::ACMRandom; ...@@ -24,12 +24,25 @@ using libvpx_test::ACMRandom;
typedef void (*conv_filter_t)(const uint8_t*, int, uint8_t*, int, typedef void (*conv_filter_t)(const uint8_t*, int, uint8_t*, int,
int, int, const InterpFilterParams, int, int, const InterpFilterParams,
const int, int, int); const int, int, int);
#if CONFIG_VP9_HIGHBITDEPTH
// High-bit-depth counterpart of conv_filter_t: the sample pointers are
// uint16_t instead of uint8_t and there is one additional trailing int.
// The vp10_highbd_convolve_{horiz,vert} prototypes that are passed through
// this type name the arguments
// (src, src_stride, dst, dst_stride, w, h, fp, subpel_x_q4, x_step_q4, avg, bd).
typedef void (*hbd_conv_filter_t)(const uint16_t*, int, uint16_t*, int,
int, int, const InterpFilterParams,
const int, int, int, int);
#endif
// Test parameter list: // Test parameter list:
// <convolve_horiz_func, convolve_vert_func, // <convolve_horiz_func, convolve_vert_func,
// <width, height>, filter_params, subpel_x_q4, avg> // <width, height>, filter_params, subpel_x_q4, avg>
typedef tuple<int, int> BlockDimension; typedef tuple<int, int> BlockDimension;
typedef tuple<conv_filter_t, conv_filter_t, BlockDimension, INTERP_FILTER, typedef tuple<conv_filter_t, conv_filter_t, BlockDimension, INTERP_FILTER,
int, int> ConvParams; int, int> ConvParams;
#if CONFIG_VP9_HIGHBITDEPTH
// Test parameter list:
// <convolve_horiz_func, convolve_vert_func,
// <width, height>, filter_params, subpel_x_q4, avg, bit_depth>
typedef tuple<hbd_conv_filter_t, hbd_conv_filter_t, BlockDimension,
INTERP_FILTER, int, int, int> HbdConvParams;
#endif
// Note: // Note:
// src_ and src_ref_ have special boundary requirement // src_ and src_ref_ have special boundary requirement
...@@ -75,11 +88,8 @@ class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> { ...@@ -75,11 +88,8 @@ class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
void RunVertFilterBitExactCheck(); void RunVertFilterBitExactCheck();
private: private:
void PrepFilterBuffer(uint8_t *src, uint8_t *src_ref, void PrepFilterBuffer(int w, int h);
uint8_t *dst, uint8_t *dst_ref, void DiffFilterBuffer();
int w, int h);
void DiffFilterBuffer(const uint8_t *buf, const uint8_t *buf_ref,
int w, int h, int fgroup, int findex);
conv_filter_t conv_horiz_; conv_filter_t conv_horiz_;
conv_filter_t conv_vert_; conv_filter_t conv_vert_;
uint8_t *alloc_; uint8_t *alloc_;
...@@ -94,18 +104,16 @@ class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> { ...@@ -94,18 +104,16 @@ class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
int avg_; int avg_;
}; };
void VP10ConvolveOptimzTest::PrepFilterBuffer(uint8_t *src, uint8_t *src_ref, void VP10ConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
uint8_t *dst, uint8_t *dst_ref,
int w, int h) {
int r, c; int r, c;
ACMRandom rnd(ACMRandom::DeterministicSeed()); ACMRandom rnd(ACMRandom::DeterministicSeed());
memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0])); memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
uint8_t *src_ptr = src; uint8_t *src_ptr = src_;
uint8_t *dst_ptr = dst; uint8_t *dst_ptr = dst_;
uint8_t *src_ref_ptr = src_ref; uint8_t *src_ref_ptr = src_ref_;
uint8_t *dst_ref_ptr = dst_ref; uint8_t *dst_ref_ptr = dst_ref_;
for (r = 0; r < height_; ++r) { for (r = 0; r < height_; ++r) {
for (c = 0; c < width_; ++c) { for (c = 0; c < width_; ++c) {
...@@ -121,21 +129,17 @@ void VP10ConvolveOptimzTest::PrepFilterBuffer(uint8_t *src, uint8_t *src_ref, ...@@ -121,21 +129,17 @@ void VP10ConvolveOptimzTest::PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
} }
} }
void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf, void VP10ConvolveOptimzTest::DiffFilterBuffer() {
const uint8_t *buf_ref,
int w, int h,
int filter_group,
int filter_index) {
int r, c; int r, c;
const uint8_t *dst_ptr = buf; const uint8_t *dst_ptr = dst_;
const uint8_t *dst_ref_ptr = buf_ref; const uint8_t *dst_ref_ptr = dst_ref_;
for (r = 0; r < h; ++r) { for (r = 0; r < height_; ++r) {
for (c = 0; c < w; ++c) { for (c = 0; c < width_; ++c) {
EXPECT_EQ((uint8_t)dst_ref_ptr[c], (uint8_t)dst_ptr[c]) EXPECT_EQ((uint8_t)dst_ref_ptr[c], (uint8_t)dst_ptr[c])
<< "Error at row: " << r << " col: " << c << " " << "Error at row: " << r << " col: " << c << " "
<< "w = " << w << " " << "h = " << h << " " << "w = " << width_ << " " << "h = " << height_ << " "
<< "filter group index = " << filter_group << " " << "filter group index = " << filter_ << " "
<< "filter index = " << filter_index; << "filter index = " << subpel_;
} }
dst_ptr += stride; dst_ptr += stride;
dst_ref_ptr += stride; dst_ref_ptr += stride;
...@@ -143,7 +147,7 @@ void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf, ...@@ -143,7 +147,7 @@ void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf,
} }
void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() { void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk); PrepFilterBuffer(testMaxBlk, testMaxBlk);
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_); InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
...@@ -153,14 +157,14 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() { ...@@ -153,14 +157,14 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
conv_horiz_(src_, stride, dst_, stride, width_, height_, conv_horiz_(src_, stride, dst_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_); filter_params, subpel_, x_step_q4, avg_);
DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_); DiffFilterBuffer();
// Note: // Note:
// Here we need calculate a height which is different from the specified one // Here we need calculate a height which is different from the specified one
// and test again. // and test again.
int intermediate_height = int intermediate_height =
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps; (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk); PrepFilterBuffer(testMaxBlk, testMaxBlk);
vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_, vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
intermediate_height, filter_params, subpel_, x_step_q4, intermediate_height, filter_params, subpel_, x_step_q4,
...@@ -170,12 +174,11 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() { ...@@ -170,12 +174,11 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
intermediate_height, filter_params, subpel_, x_step_q4, intermediate_height, filter_params, subpel_, x_step_q4,
avg_); avg_);
DiffFilterBuffer(dst_, dst_ref_, width_, intermediate_height, filter_, DiffFilterBuffer();
subpel_);
} }
void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() { void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk); PrepFilterBuffer(testMaxBlk, testMaxBlk);
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_); InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
...@@ -185,7 +188,7 @@ void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() { ...@@ -185,7 +188,7 @@ void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
conv_vert_(src_, stride, dst_, stride, width_, height_, conv_vert_(src_, stride, dst_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_); filter_params, subpel_, x_step_q4, avg_);
DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_); DiffFilterBuffer();
} }
TEST_P(VP10ConvolveOptimzTest, HorizBitExactCheck) { TEST_P(VP10ConvolveOptimzTest, HorizBitExactCheck) {
...@@ -197,7 +200,7 @@ TEST_P(VP10ConvolveOptimzTest, VerticalBitExactCheck) { ...@@ -197,7 +200,7 @@ TEST_P(VP10ConvolveOptimzTest, VerticalBitExactCheck) {
using std::tr1::make_tuple; using std::tr1::make_tuple;
#if HAVE_SSSE3 && CONFIG_EXT_INTERP #if (HAVE_SSSE3 || HAVE_SSE4_1) && CONFIG_EXT_INTERP
const BlockDimension kBlockDim[] = { const BlockDimension kBlockDim[] = {
make_tuple(2, 2), make_tuple(2, 2),
make_tuple(2, 4), make_tuple(2, 4),
...@@ -225,7 +228,9 @@ const INTERP_FILTER kFilter[] = {6, 4, 2}; ...@@ -225,7 +228,9 @@ const INTERP_FILTER kFilter[] = {6, 4, 2};
const int kSubpelQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const int kSubpelQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const int kAvg[] = {0, 1}; const int kAvg[] = {0, 1};
#endif
#if HAVE_SSSE3 && CONFIG_EXT_INTERP
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(
SSSE3, VP10ConvolveOptimzTest, SSSE3, VP10ConvolveOptimzTest,
::testing::Combine( ::testing::Combine(
...@@ -236,4 +241,167 @@ INSTANTIATE_TEST_CASE_P( ...@@ -236,4 +241,167 @@ INSTANTIATE_TEST_CASE_P(
::testing::ValuesIn(kSubpelQ4), ::testing::ValuesIn(kSubpelQ4),
::testing::ValuesIn(kAvg))); ::testing::ValuesIn(kAvg)));
#endif // HAVE_SSSE3 && CONFIG_EXT_INTERP #endif // HAVE_SSSE3 && CONFIG_EXT_INTERP
#if CONFIG_VP9_HIGHBITDEPTH
typedef ::testing::TestWithParam<HbdConvParams> TestWithHbdConvParams;
class VP10HbdConvolveOptimzTest : public TestWithHbdConvParams {
public:
virtual ~VP10HbdConvolveOptimzTest() {}
virtual void SetUp() {
conv_horiz_ = GET_PARAM(0);
conv_vert_ = GET_PARAM(1);
BlockDimension block = GET_PARAM(2);
width_ = std::tr1::get<0>(block);
height_ = std::tr1::get<1>(block);
filter_ = GET_PARAM(3);
subpel_ = GET_PARAM(4);
avg_ = GET_PARAM(5);
bit_depth_ = GET_PARAM(6);
alloc_ = new uint16_t[maxBlockSize * 4];
src_ = alloc_ + (vertiOffset * maxWidth);
src_ += horizOffset;
src_ref_ = src_ + maxBlockSize;
dst_ = alloc_ + 2 * maxBlockSize;
dst_ref_ = alloc_ + 3 * maxBlockSize;
}
virtual void TearDown() {
delete[] alloc_;
libvpx_test::ClearSystemState();
}
protected:
void RunHorizFilterBitExactCheck();
void RunVertFilterBitExactCheck();
private:
void PrepFilterBuffer(int w, int h);
void DiffFilterBuffer();
hbd_conv_filter_t conv_horiz_;
hbd_conv_filter_t conv_vert_;
uint16_t *alloc_;
uint16_t *src_;
uint16_t *dst_;
uint16_t *src_ref_;
uint16_t *dst_ref_;
int width_;
int height_;
int filter_;
int subpel_;
int avg_;
int bit_depth_;
};
// Re-initializes the working buffers for one filter run.
//
// The whole allocation is zeroed first. Then the top-left width_ x height_
// area of src_ and dst_ is filled with pseudo-random samples limited to
// bit_depth_ bits, and every dst_ sample is mirrored into dst_ref_ so that
// both destination buffers start out identical. The generator is seeded
// deterministically, so every call produces the same contents.
//
// w, h: kept for signature compatibility with existing callers; they are not
//       used. The filled area is always width_ x height_.
void VP10HbdConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
  // Explicitly mark the ignored arguments so they do not trigger
  // unused-parameter warnings.
  (void)w;
  (void)h;

  ACMRandom rnd(ACMRandom::DeterministicSeed());

  memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));

  // Keeps only the low bit_depth_ bits of each 16-bit random value.
  const uint16_t hbd_mask = static_cast<uint16_t>((1 << bit_depth_) - 1);

  uint16_t *src_ptr = src_;
  uint16_t *dst_ptr = dst_;
  uint16_t *dst_ref_ptr = dst_ref_;

  for (int r = 0; r < height_; ++r) {
    for (int c = 0; c < width_; ++c) {
      // Draw order (source first, then destination) is part of the
      // deterministic sequence; keep it unchanged.
      src_ptr[c] = rnd.Rand16() & hbd_mask;
      dst_ptr[c] = rnd.Rand16() & hbd_mask;
      dst_ref_ptr[c] = dst_ptr[c];
    }
    src_ptr += stride;
    dst_ptr += stride;
    dst_ref_ptr += stride;
  }
}
// Compares the first height_ rows and width_ columns of dst_ against
// dst_ref_ and records a non-fatal failure, tagged with the current test
// parameters, for every sample that differs.
void VP10HbdConvolveOptimzTest::DiffFilterBuffer() {
  for (int r = 0; r < height_; ++r) {
    // Start of row r in the tested and reference outputs.
    const uint16_t *const dst_ptr = dst_ + r * stride;
    const uint16_t *const dst_ref_ptr = dst_ref_ + r * stride;

    for (int c = 0; c < width_; ++c) {
      EXPECT_EQ((uint16_t)dst_ref_ptr[c], (uint16_t)dst_ptr[c])
          << "Error at row: " << r << " col: " << c << " "
          << "w = " << width_ << " " << "h = " << height_ << " "
          << "filter group index = " << filter_ << " "
          << "filter index = " << subpel_ << " "
          << "bit depth = " << bit_depth_;
    }
  }
}
void VP10HbdConvolveOptimzTest::RunHorizFilterBitExactCheck() {
PrepFilterBuffer(testMaxBlk, testMaxBlk);
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
vp10_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
height_, filter_params, subpel_, x_step_q4,
avg_, bit_depth_);
conv_horiz_(src_, stride, dst_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_, bit_depth_);
DiffFilterBuffer();
// Note:
// Here we need calculate a height which is different from the specified one
// and test again.
int intermediate_height =
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
PrepFilterBuffer(testMaxBlk, testMaxBlk);
vp10_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
intermediate_height, filter_params, subpel_,
x_step_q4, avg_, bit_depth_);
conv_horiz_(src_, stride, dst_, stride, width_, intermediate_height,
filter_params, subpel_, x_step_q4, avg_, bit_depth_);
DiffFilterBuffer();
}
void VP10HbdConvolveOptimzTest::RunVertFilterBitExactCheck() {
PrepFilterBuffer(testMaxBlk, testMaxBlk);
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
vp10_highbd_convolve_vert_c(src_, stride, dst_ref_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_,
bit_depth_);
conv_vert_(src_, stride, dst_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_, bit_depth_);
DiffFilterBuffer();
}
// Horizontal filter: output of the function under test must match the C
// reference sample-for-sample.
TEST_P(VP10HbdConvolveOptimzTest, HorizBitExactCheck) {
RunHorizFilterBitExactCheck();
}
// Vertical filter: output of the function under test must match the C
// reference sample-for-sample.
TEST_P(VP10HbdConvolveOptimzTest, VertBitExactCheck) {
RunVertFilterBitExactCheck();
}
#if HAVE_SSE4_1 && CONFIG_EXT_INTERP
// Bit depths covered by the high-bit-depth instantiation below.
const int kBitdepth[] = {10, 12};
// Runs the SSE4.1 horizontal and vertical filters over every combination of
// block dimension, filter, sub-pixel position, averaging flag and bit depth.
INSTANTIATE_TEST_CASE_P(
SSE4_1, VP10HbdConvolveOptimzTest,
::testing::Combine(
::testing::Values(vp10_highbd_convolve_horiz_sse4_1),
::testing::Values(vp10_highbd_convolve_vert_sse4_1),
::testing::ValuesIn(kBlockDim),
::testing::ValuesIn(kFilter),
::testing::ValuesIn(kSubpelQ4),
::testing::ValuesIn(kAvg),
::testing::ValuesIn(kBitdepth)));
#endif // HAVE_SSE4_1 && CONFIG_EXT_INTERP
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace } // namespace
...@@ -342,3 +342,25 @@ SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir( ...@@ -342,3 +342,25 @@ SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
(void)index; (void)index;
return NULL; return NULL;
} }
#if CONFIG_VP9_HIGHBITDEPTH
// Looks up the high-bit-depth vertical "signal direction" coefficient rows
// for a filter.
//
// p:     filter parameters; only p.filter_ptr is inspected, and it is matched
//        by address against the known filter tables.
// index: first-dimension index into the matching table (declared with 15
//        entries); it is not range-checked here.
//
// Returns a pointer to the first 8-coefficient row of entry `index` in the
// matching table, or NULL when the filter has no such table or the required
// build options are disabled.
HbdSubpelFilterCoeffs vp10_hbd_get_subpel_filter_ver_signal_dir(
    const InterpFilterParams p, int index) {
#if CONFIG_EXT_INTERP && HAVE_SSE4_1
  if ((const int16_t *)sub_pel_filters_12sharp == p.filter_ptr)
    return sub_pel_filters_12sharp_highbd_ver_signal_dir[index];
  if ((const int16_t *)sub_pel_filters_10sharp == p.filter_ptr)
    return sub_pel_filters_10sharp_highbd_ver_signal_dir[index];
#endif
#if USE_TEMPORALFILTER_12TAP && HAVE_SSE4_1
  if ((const int16_t *)sub_pel_filters_temporalfilter_12 == p.filter_ptr)
    return sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[index];
#endif
  // Keeps the arguments "used" when both conditional sections above are
  // compiled out.
  (void)p;
  (void)index;
  return NULL;
}
#endif
...@@ -95,6 +95,10 @@ static INLINE int vp10_is_interpolating_filter( ...@@ -95,6 +95,10 @@ static INLINE int vp10_is_interpolating_filter(
#if USE_TEMPORALFILTER_12TAP #if USE_TEMPORALFILTER_12TAP
extern const int8_t sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]; extern const int8_t sub_pel_filters_temporalfilter_12_signal_dir[15][2][16];
extern const int8_t sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]; extern const int8_t sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16];
#if CONFIG_VP9_HIGHBITDEPTH
extern const
int16_t sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[15][6][8];
#endif
#endif #endif
#if CONFIG_EXT_INTERP #if CONFIG_EXT_INTERP
...@@ -102,15 +106,26 @@ extern const int8_t sub_pel_filters_12sharp_signal_dir[15][2][16]; ...@@ -102,15 +106,26 @@ extern const int8_t sub_pel_filters_12sharp_signal_dir[15][2][16];
extern const int8_t sub_pel_filters_10sharp_signal_dir[15][2][16]; extern const int8_t sub_pel_filters_10sharp_signal_dir[15][2][16];
extern const int8_t sub_pel_filters_12sharp_ver_signal_dir[15][6][16]; extern const int8_t sub_pel_filters_12sharp_ver_signal_dir[15][6][16];
extern const int8_t sub_pel_filters_10sharp_ver_signal_dir[15][6][16]; extern const int8_t sub_pel_filters_10sharp_ver_signal_dir[15][6][16];
#if CONFIG_VP9_HIGHBITDEPTH
extern const int16_t sub_pel_filters_12sharp_highbd_ver_signal_dir[15][6][8];
extern const int16_t sub_pel_filters_10sharp_highbd_ver_signal_dir[15][6][8];
#endif
#endif #endif
typedef const int8_t (*SubpelFilterCoeffs)[16]; typedef const int8_t (*SubpelFilterCoeffs)[16];
#if CONFIG_VP9_HIGHBITDEPTH
typedef const int16_t (*HbdSubpelFilterCoeffs)[8];
#endif
SubpelFilterCoeffs vp10_get_subpel_filter_signal_dir( SubpelFilterCoeffs vp10_get_subpel_filter_signal_dir(
const InterpFilterParams p, int index); const InterpFilterParams p, int index);
SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir( SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
const InterpFilterParams p, int index); const InterpFilterParams p, int index);
#if CONFIG_VP9_HIGHBITDEPTH
HbdSubpelFilterCoeffs vp10_hbd_get_subpel_filter_ver_signal_dir(
const InterpFilterParams p, int index);
#endif
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
......
...@@ -182,7 +182,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst, ...@@ -182,7 +182,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
} }
#if CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH
static void highbd_convolve_horiz(const uint16_t *src, int src_stride, void vp10_highbd_convolve_horiz_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h, uint16_t *dst, int dst_stride, int w, int h,
const InterpFilterParams filter_params, const InterpFilterParams filter_params,
const int subpel_x_q4, int x_step_q4, int avg, const int subpel_x_q4, int x_step_q4, int avg,
...@@ -213,7 +213,7 @@ static void highbd_convolve_horiz(const uint16_t *src, int src_stride, ...@@ -213,7 +213,7 @@ static void highbd_convolve_horiz(const uint16_t *src, int src_stride,
} }
} }
static void highbd_convolve_vert(const uint16_t *src, int src_stride, void vp10_highbd_convolve_vert_c(const uint16_t *src, int src_stride,
uint16_t *dst, int dst_stride, int w, int h, uint16_t *dst, int dst_stride, int w, int h,
const InterpFilterParams filter_params, const InterpFilterParams filter_params,
const int subpel_y_q4, int y_step_q4, int avg, const int subpel_y_q4, int y_step_q4, int avg,
...@@ -300,8 +300,9 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8, ...@@ -300,8 +300,9 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
InterpFilterParams filter_params = InterpFilterParams filter_params =
vp10_get_interp_filter_params(interp_filter); vp10_get_interp_filter_params(interp_filter);
#endif #endif
highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params, vp10_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h,
subpel_x_q4, x_step_q4, ref_idx, bd); filter_params, subpel_x_q4, x_step_q4, ref_idx,
bd);
} else if (ignore_horiz) { } else if (ignore_horiz) {
#if CONFIG_DUAL_FILTER #if CONFIG_DUAL_FILTER
InterpFilterParams filter_params = InterpFilterParams filter_params =
...@@ -310,8 +311,9 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8, ...@@ -310,8 +311,9 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
InterpFilterParams filter_params = InterpFilterParams filter_params =
vp10_get_interp_filter_params(interp_filter); vp10_get_interp_filter_params(interp_filter);
#endif #endif
highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params, vp10_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h,
subpel_y_q4, y_step_q4, ref_idx, bd); filter_params, subpel_y_q4, y_step_q4, ref_idx,
bd);
} else { } else {
// temp's size is set to (maximum possible intermediate_height) * // temp's size is set to (maximum possible intermediate_height) *
// MAX_BLOCK_WIDTH // MAX_BLOCK_WIDTH
...@@ -336,9 +338,10 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8, ...@@ -336,9 +338,10 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
int intermediate_height = int intermediate_height =
(((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size; (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
highbd_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, vp10_highbd_convolve_horiz(src - src_stride * (filter_size / 2 - 1),
temp, temp_stride, w, intermediate_height, src_stride, temp, temp_stride, w,
filter_params, subpel_x_q4, x_step_q4, 0, bd); intermediate_height, filter_params, subpel_x_q4,
x_step_q4, 0, bd);
#if CONFIG_DUAL_FILTER #if CONFIG_DUAL_FILTER
filter_params = filter_params_y; filter_params = filter_params_y;
...@@ -346,7 +349,7 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8, ...@@ -346,7 +349,7 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
filter_size = filter_params.taps; filter_size = filter_params.taps;
assert(filter_params.taps <= MAX_FILTER_TAP); assert(filter_params.taps <= MAX_FILTER_TAP);
highbd_convolve_vert(temp + temp_stride * (filter_size / 2 - 1), vp10_highbd_convolve_vert(temp + temp_stride * (filter_size / 2 - 1),
temp_stride, dst, dst_stride, w, h, filter_params, temp_stride, dst, dst_stride, w, h, filter_params,
subpel_y_q4, y_step_q4, ref_idx, bd); subpel_y_q4, y_step_q4, ref_idx, bd);
} }
......
...@@ -93,6 +93,13 @@ specialize qw/vp10_convolve_horiz ssse3/; ...@@ -93,6 +93,13 @@ specialize qw/vp10_convolve_horiz ssse3/;
add_proto qw/void vp10_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg"; add_proto qw/void vp10_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
specialize qw/vp10_convolve_vert ssse3/; specialize qw/vp10_convolve_vert ssse3/;
# High-bit-depth horizontal/vertical convolve prototypes. They are declared
# only when CONFIG_VP9_HIGHBITDEPTH is enabled, and each one has an SSE4.1
# specialization.
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp10_highbd_convolve_horiz/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
specialize qw/vp10_highbd_convolve_horiz sse4_1/;
add_proto qw/void vp10_highbd_convolve_vert/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
specialize qw/vp10_highbd_convolve_vert sse4_1/;
}
# #
# dct # dct
# #
......
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp10/common/filter.h"