Commit 81ad9536 authored by Yi Luo's avatar Yi Luo
Browse files

Convolution vertical filter SSSE3 optimization

- Apply 8-pixel vertical filtering direction parallelism.
- Add unit tests to verify bit-exactness against the C implementation.
- Encoder speed improves ~29% (enable EXT_INTERP) on Xeon E5-2680.
- Combinational cycle count of vp10_convolve() drops from 26.06%
  to 6.73%.

Change-Id: Ic1ae48f8fb1909991577947a8c00d07832737e57
parent 76ff9b30
......@@ -21,13 +21,15 @@ namespace {
using std::tr1::tuple;
using libvpx_test::ACMRandom;
typedef void (*conv_horiz_t)(const uint8_t*, int, uint8_t*, int,
int, int, const InterpFilterParams,
const int, int, int);
typedef void (*conv_filter_t)(const uint8_t*, int, uint8_t*, int,
int, int, const InterpFilterParams,
const int, int, int);
// Test parameter list:
// <convolve_horiz_func, <width, height>, filter_params, subpel_x_q4, avg>
// <convolve_horiz_func, convolve_vert_func,
// <width, height>, filter_params, subpel_x_q4, avg>
typedef tuple<int, int> BlockDimension;
typedef tuple<conv_horiz_t, BlockDimension, INTERP_FILTER, int, int> ConvParams;
typedef tuple<conv_filter_t, conv_filter_t, BlockDimension, INTERP_FILTER,
int, int> ConvParams;
// Note:
// src_ and src_ref_ have special boundary requirement
......@@ -44,13 +46,14 @@ class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
public:
virtual ~VP10ConvolveOptimzTest() {}
virtual void SetUp() {
conv_ = GET_PARAM(0);
BlockDimension block = GET_PARAM(1);
conv_horiz_ = GET_PARAM(0);
conv_vert_ = GET_PARAM(1);
BlockDimension block = GET_PARAM(2);
width_ = std::tr1::get<0>(block);
height_ = std::tr1::get<1>(block);
filter_ = GET_PARAM(2);
subpel_ = GET_PARAM(3);
avg_ = GET_PARAM(4);
filter_ = GET_PARAM(3);
subpel_ = GET_PARAM(4);
avg_ = GET_PARAM(5);
alloc_ = new uint8_t[maxBlockSize * 4];
src_ = alloc_ + (vertiOffset * maxWidth);
......@@ -68,6 +71,7 @@ class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
protected:
void RunHorizFilterBitExactCheck();
void RunVertFilterBitExactCheck();
private:
void PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
......@@ -75,7 +79,8 @@ class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
int w, int h);
void DiffFilterBuffer(const uint8_t *buf, const uint8_t *buf_ref,
int w, int h, int fgroup, int findex);
conv_horiz_t conv_;
conv_filter_t conv_horiz_;
conv_filter_t conv_vert_;
uint8_t *alloc_;
uint8_t *src_;
uint8_t *dst_;
......@@ -94,7 +99,7 @@ void VP10ConvolveOptimzTest::PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
int r, c;
ACMRandom rnd(ACMRandom::DeterministicSeed());
memset(alloc_, 0, 4 * maxBlockSize);
memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
uint8_t *src_ptr = src;
uint8_t *dst_ptr = dst;
......@@ -144,8 +149,8 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_);
conv_(src_, stride, dst_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_);
conv_horiz_(src_, stride, dst_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_);
DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
......@@ -160,21 +165,40 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
intermediate_height, filter_params, subpel_, x_step_q4,
avg_);
conv_(src_, stride, dst_, stride, width_,
intermediate_height, filter_params, subpel_, x_step_q4,
avg_);
conv_horiz_(src_, stride, dst_, stride, width_,
intermediate_height, filter_params, subpel_, x_step_q4,
avg_);
DiffFilterBuffer(dst_, dst_ref_, width_, intermediate_height, filter_,
subpel_);
}
// Runs the vertical filter under test and vp10_convolve_vert_c with the same
// block size, filter, sub-pixel position and avg flag, then hands both
// destination buffers to DiffFilterBuffer for comparison.
void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
  // Prepare the source/destination buffers for both code paths.
  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);

  const InterpFilterParams vert_params =
      vp10_get_interp_filter_params(filter_);

  // Reference result, produced by the C implementation.
  // NOTE(review): x_step_q4 is reused here as the vertical step; its value is
  // defined outside this function.
  vp10_convolve_vert_c(src_ref_, stride, dst_ref_, stride, width_, height_,
                       vert_params, subpel_, x_step_q4, avg_);

  // Result from the implementation supplied as the test parameter.
  conv_vert_(src_, stride, dst_, stride, width_, height_, vert_params, subpel_,
             x_step_q4, avg_);

  DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
}
// Parameterized test: runs the horizontal-filter comparison for every
// parameter combination instantiated for VP10ConvolveOptimzTest.
TEST_P(VP10ConvolveOptimzTest, HorizBitExactCheck) {
RunHorizFilterBitExactCheck();
}
// Parameterized test: runs the vertical-filter comparison for every
// parameter combination instantiated for VP10ConvolveOptimzTest.
TEST_P(VP10ConvolveOptimzTest, VerticalBitExactCheck) {
RunVertFilterBitExactCheck();
}
using std::tr1::make_tuple;
const BlockDimension kBlockDim[] = {
make_tuple(2, 2),
make_tuple(2, 4),
make_tuple(4, 4),
make_tuple(4, 8),
make_tuple(8, 4),
......@@ -195,7 +219,7 @@ const BlockDimension kBlockDim[] = {
// 10/12-tap filters
const INTERP_FILTER kFilter[] = {6, 4, 2};
const int kSubpelXQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const int kSubpelQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const int kAvg[] = {0, 1};
......@@ -204,9 +228,10 @@ INSTANTIATE_TEST_CASE_P(
SSSE3, VP10ConvolveOptimzTest,
::testing::Combine(
::testing::Values(vp10_convolve_horiz_ssse3),
::testing::Values(vp10_convolve_vert_ssse3),
::testing::ValuesIn(kBlockDim),
::testing::ValuesIn(kFilter),
::testing::ValuesIn(kSubpelXQ4),
::testing::ValuesIn(kSubpelQ4),
::testing::ValuesIn(kAvg)));
#endif // HAVE_SSSE3 && CONFIG_EXT_INTERP
} // namespace
......@@ -11,6 +11,13 @@
using libvpx_test::ACMRandom;
namespace {
// When SSSE3 support is compiled in, assigns the C implementations to
// vp10_convolve_horiz and vp10_convolve_vert. The tests in this file call it
// right after vp10_rtcd().
// NOTE(review): presumably this keeps these tests running against the C code
// regardless of what vp10_rtcd() selected -- confirm.
void setup_convolve() {
#if HAVE_SSSE3
vp10_convolve_horiz = vp10_convolve_horiz_c;
vp10_convolve_vert = vp10_convolve_vert_c;
#endif
}
TEST(VP10ConvolveTest, vp10_convolve8) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
#if CONFIG_DUAL_FILTER
......@@ -41,7 +48,7 @@ TEST(VP10ConvolveTest, vp10_convolve8) {
int w = 1;
int h = 1;
vp10_rtcd();
setup_convolve();
for (int i = 0; i < filter_size * filter_size; i++) {
src[i] = rnd.Rand16() % (1 << 8);
......@@ -89,7 +96,7 @@ TEST(VP10ConvolveTest, vp10_convolve) {
int subpel_x_q4;
int subpel_y_q4;
vp10_rtcd();
setup_convolve();
for (int i = 0; i < filter_size * filter_size; i++) {
src[i] = rnd.Rand16() % (1 << 8);
......@@ -155,7 +162,7 @@ TEST(VP10ConvolveTest, vp10_convolve_avg) {
int subpel_x_q4;
int subpel_y_q4;
vp10_rtcd();
setup_convolve();
for (int i = 0; i < filter_size * filter_size; i++) {
src0[i] = rnd.Rand16() % (1 << 8);
......
......@@ -40,7 +40,7 @@ void vp10_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst,
}
}
static void convolve_vert(const uint8_t *src, int src_stride, uint8_t *dst,
void vp10_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams filter_params,
const int subpel_y_q4, int y_step_q4, int avg) {
......@@ -133,13 +133,13 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
vp10_get_interp_filter_params(interp_filter);
#endif
assert(filter_params.taps <= MAX_FILTER_TAP);
convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
subpel_y_q4, y_step_q4, ref_idx);
vp10_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
subpel_y_q4, y_step_q4, ref_idx);
} else {
// temp's size is set to (maximum possible intermediate_height) *
// MAX_BLOCK_WIDTH
uint8_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
MAX_FILTER_TAP) *
MAX_FILTER_TAP + 1) *
MAX_BLOCK_WIDTH];
int temp_stride = MAX_BLOCK_WIDTH;
#if CONFIG_DUAL_FILTER
......@@ -164,7 +164,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
assert(filter_params.taps <= MAX_FILTER_TAP);
vp10_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
temp, temp_stride, w, intermediate_height,
temp + temp_stride, temp_stride, w, intermediate_height,
filter_params, subpel_x_q4, x_step_q4, 0);
#if CONFIG_DUAL_FILTER
......@@ -175,9 +175,9 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
filter_size = filter_params.taps;
assert(filter_params.taps <= MAX_FILTER_TAP);
convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride, dst,
dst_stride, w, h, filter_params,
subpel_y_q4, y_step_q4, ref_idx);
vp10_convolve_vert(temp + temp_stride * (filter_size / 2), temp_stride,
dst, dst_stride, w, h, filter_params,
subpel_y_q4, y_step_q4, ref_idx);
}
}
......
......@@ -90,6 +90,9 @@ specialize qw/vp10_filter_by_weight8x8 sse2 msa/;
add_proto qw/void vp10_convolve_horiz/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
specialize qw/vp10_convolve_horiz ssse3/;
# Vertical convolution entry point. The sub-pixel position and step are along
# the y axis, matching the parameter names used by the C definition.
add_proto qw/void vp10_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_y_q4, int y_step_q4, int avg";
specialize qw/vp10_convolve_vert ssse3/;
#
# dct
#
......
......@@ -37,27 +37,29 @@ static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
// they're zero vectors.
}
typedef void (*store_pixel_t)(__m128i x, uint8_t *src, uint8_t *dst);
typedef void (*store_pixel_t)(const __m128i *x, uint8_t *src, uint8_t *dst);
static INLINE void store_4_pixel_only(__m128i x, uint8_t *src, uint8_t *dst) {
static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *src,
uint8_t *dst) {
__m128i u;
(void)src;
x = _mm_packus_epi16(x, x);
*(int *)dst = _mm_cvtsi128_si32(x);
u = _mm_packus_epi16(*x, *x);
*(int *)dst = _mm_cvtsi128_si32(u);
}
static INLINE __m128i accumulate_store(__m128i x, uint8_t *src) {
static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
__m128i y = _mm_loadl_epi64((__m128i const *)src);
y = _mm_unpacklo_epi8(y, zero);
y = _mm_add_epi16(x, y);
y = _mm_add_epi16(*x, y);
y = _mm_add_epi16(y, one);
y = _mm_srai_epi16(y, 1);
y = _mm_packus_epi16(y, y);
return y;
}
static INLINE void accumulate_store_4_pixel(__m128i x, uint8_t *src,
static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *src,
uint8_t *dst) {
__m128i y = accumulate_store(x, src);
*(int *)dst = _mm_cvtsi128_si32(y);
......@@ -102,7 +104,7 @@ void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
store_func(sumPairRow[1], dst, buf);
store_func(&sumPairRow[1], dst, buf);
}
void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
......@@ -714,3 +716,190 @@ void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
}
}
}
// Vertical convolution filtering
// Packs the 16-bit lanes of *x to unsigned 8-bit values (with saturation) and
// writes the two lowest resulting pixels to dst[0] and dst[1].
// src is unused; it is present only so the function matches store_pixel_t.
static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *src,
                                      uint8_t *dst) {
  const __m128i packed = _mm_packus_epi16(*x, *x);
  const uint32_t low = (uint32_t)_mm_cvtsi128_si32(packed);
  (void)src;
  // Write the two pixels byte by byte instead of through a uint16_t pointer:
  // dst carries no 2-byte alignment guarantee, and the punned store is
  // undefined behavior. Pixel 0 sits in bits 0-7 and pixel 1 in bits 8-15,
  // so the bytes written are the same as the former little-endian store.
  dst[0] = (uint8_t)(low & 0xff);
  dst[1] = (uint8_t)((low >> 8) & 0xff);
}
// Averages *x with the pixels at src via accumulate_store() and writes the
// two lowest resulting pixels to dst[0] and dst[1].
// Note that accumulate_store() loads 8 bytes from src even though only two
// of the results are kept here.
static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *src,
                                            uint8_t *dst) {
  const __m128i averaged = accumulate_store(x, src);
  const uint32_t low = (uint32_t)_mm_cvtsi128_si32(averaged);
  // Write the two pixels byte by byte instead of through a uint16_t pointer:
  // dst carries no 2-byte alignment guarantee, and the punned store is
  // undefined behavior. Pixel 0 sits in bits 0-7 and pixel 1 in bits 8-15,
  // so the bytes written are the same as the former little-endian store.
  dst[0] = (uint8_t)(low & 0xff);
  dst[1] = (uint8_t)((low >> 8) & 0xff);
}
// Packs the eight 16-bit lanes of *x to unsigned 8-bit values (with
// saturation) and writes those 8 pixels to dst.
// src is unused; it is present only so the function matches store_pixel_t.
static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *src,
                                      uint8_t *dst) {
  const __m128i packed = _mm_packus_epi16(*x, *x);
  (void)src;
  _mm_storel_epi64((__m128i *)dst, packed);
}
static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *src,
uint8_t *dst) {
__m128i y = accumulate_store(x, src);
_mm_storel_epi64((__m128i *)dst, y);
}
// Store-function tables, indexed by the avg flag of the convolve call:
// entry 0 writes the filtered pixels as-is, entry 1 averages them with the
// pixels read through the store function's src argument before writing.
// 8-pixel-wide variants.
static store_pixel_t store8pixelTab[2] = {
store_8_pixel_only, accumulate_store_8_pixel};
// 2-pixel-wide variants.
static store_pixel_t store2pixelTab[2] = {
store_2_pixel_only, accumulate_store_2_pixel};
// Applies a 12-row vertical filter to one group of 8 horizontally adjacent
// pixels.
//   src        - first of the 12 consecutive rows that are read
//   src_stride - distance in bytes between rows
//   f          - six coefficient vectors; f[k] is multiplied against the
//                byte-interleaved rows 2k and 2k+1 with _mm_maddubs_epi16,
//                so it is expected to hold the matching signed 8-bit
//                coefficient pairs
// Returns eight 16-bit lanes, each holding a filtered pixel already clamped
// to [0, 255].
static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
                                 __m128i *f) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i zero = _mm_setzero_si128();
  __m128i min_x2x3, max_x2x3, sum;

  // Only the low 8 bytes of each row are consumed by the unpacklo below, so
  // load exactly 8 bytes per row rather than 16. This avoids reading 8 bytes
  // beyond the pixels that are actually filtered.
  __m128i s0 = _mm_loadl_epi64((__m128i const *)(src));
  __m128i s1 = _mm_loadl_epi64((__m128i const *)(src + src_stride));
  __m128i s2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
  __m128i s3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
  __m128i s4 = _mm_loadl_epi64((__m128i const *)(src + 4 * src_stride));
  __m128i s5 = _mm_loadl_epi64((__m128i const *)(src + 5 * src_stride));
  __m128i s6 = _mm_loadl_epi64((__m128i const *)(src + 6 * src_stride));
  __m128i s7 = _mm_loadl_epi64((__m128i const *)(src + 7 * src_stride));
  __m128i s8 = _mm_loadl_epi64((__m128i const *)(src + 8 * src_stride));
  __m128i s9 = _mm_loadl_epi64((__m128i const *)(src + 9 * src_stride));
  __m128i s10 = _mm_loadl_epi64((__m128i const *)(src + 10 * src_stride));
  __m128i s11 = _mm_loadl_epi64((__m128i const *)(src + 11 * src_stride));

  // Interleave vertically adjacent rows so every 16-bit lane holds the two
  // samples that share one coefficient pair.
  s0 = _mm_unpacklo_epi8(s0, s1);
  s2 = _mm_unpacklo_epi8(s2, s3);
  s4 = _mm_unpacklo_epi8(s4, s5);
  s6 = _mm_unpacklo_epi8(s6, s7);
  s8 = _mm_unpacklo_epi8(s8, s9);
  s10 = _mm_unpacklo_epi8(s10, s11);

  // Multiply by the coefficients and add each pair into a 16-bit partial sum.
  s0 = _mm_maddubs_epi16(s0, f[0]);
  s2 = _mm_maddubs_epi16(s2, f[1]);
  s4 = _mm_maddubs_epi16(s4, f[2]);
  s6 = _mm_maddubs_epi16(s6, f[3]);
  s8 = _mm_maddubs_epi16(s8, f[4]);
  s10 = _mm_maddubs_epi16(s10, f[5]);

  // The two centre partial sums are added last, smaller one first.
  // NOTE(review): presumably this ordering keeps the saturating adds from
  // clipping an intermediate value -- confirm.
  min_x2x3 = _mm_min_epi16(s4, s6);
  max_x2x3 = _mm_max_epi16(s4, s6);
  sum = _mm_adds_epi16(s0, s2);
  sum = _mm_adds_epi16(sum, s10);
  sum = _mm_adds_epi16(sum, s8);

  sum = _mm_adds_epi16(sum, min_x2x3);
  sum = _mm_adds_epi16(sum, max_x2x3);

  // mulhrs by 256 is a rounding right shift by 7.
  sum = _mm_mulhrs_epi16(sum, k_256);
  // Clamp to 8 bits, then widen back to 16-bit lanes for the store helpers.
  sum = _mm_packus_epi16(sum, sum);
  sum = _mm_unpacklo_epi8(sum, zero);
  return sum;
}
// Filters one group of 8 horizontally adjacent pixels for a single output row
// and hands the result to store_func.
// For a 10-tap filter the window starts one row earlier, because
// filter_vert_ssse3() always consumes 12 rows.
// NOTE(review): this presumably relies on the 10-tap coefficients being
// padded out to 12 entries -- confirm against the coefficient tables.
// dst is passed to store_func as both its src and dst arguments, so the
// averaging store variants blend with the pixels already in dst.
static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
                                             __m128i *f, int tapsNum,
                                             store_pixel_t store_func,
                                             uint8_t *dst) {
  const uint8_t *const firstRow = (10 == tapsNum) ? src - src_stride : src;
  __m128i filtered = filter_vert_ssse3(firstRow, src_stride, f);
  store_func(&filtered, dst, dst);
}
// Vertically filters a block that is narrow enough for one store per row:
// each row is produced by a single filter_vert_horiz_parallel_ssse3() call,
// and store_func decides how many pixels are written.
// The loop body runs at least once, including when h <= 0.
void filter_vert_compute_small(const uint8_t *src, int src_stride, __m128i *f,
                               int tapsNum, store_pixel_t store_func, int h,
                               uint8_t *dst, int dst_stride) {
  int rowsDone = 0;
  do {
    filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func,
                                     dst);
    src += src_stride;
    dst += dst_stride;
  } while (++rowsDone < h);
}
// Vertically filters a block wider than one store: every row is processed in
// steps of 8 columns, one filter_vert_horiz_parallel_ssse3() call per step.
// The row loop runs at least once, including when h <= 0.
// NOTE(review): columns advance by 8, so a width that is not a multiple of 8
// would be processed past w -- confirm callers only pass multiples of 8.
void filter_vert_compute_large(const uint8_t *src, int src_stride, __m128i *f,
                               int tapsNum, store_pixel_t store_func, int w,
                               int h, uint8_t *dst, int dst_stride) {
  const uint8_t *srcRow = src;
  uint8_t *dstRow = dst;
  int rowsDone = 0;

  do {
    int x;
    for (x = 0; x < w; x += 8) {
      filter_vert_horiz_parallel_ssse3(srcRow + x, src_stride, f, tapsNum,
                                       store_func, dstRow + x);
    }
    srcRow += src_stride;
    dstRow += dst_stride;
  } while (++rowsDone < h);
}
// SSSE3 vertical convolution.
//   src, src_stride - source pixels and row stride
//   dst, dst_stride - destination pixels and row stride
//   w, h            - block width and height; widths of 2, 4 and > 4 are
//                     handled, anything else hits assert(0)
//   filter_params   - interpolation filter description (taps is read here)
//   subpel_y_q4     - vertical sub-pixel position
//   y_step_q4       - vertical step; only 16 takes the SIMD path
//   avg             - selects the plain (0) or averaging (1) store functions
// Defers to vp10_convolve_vert_c() when subpel_y_q4 is 0, when y_step_q4 is
// not 16, or when no SIMD coefficient table is returned for the filter.
void vp10_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
                              const InterpFilterParams filter_params,
                              const int subpel_y_q4, int y_step_q4, int avg) {
  __m128i coeffs[6];
  SubpelFilterCoeffs coeffTab;
  const store_pixel_t writeW2 = store2pixelTab[avg];
  const store_pixel_t writeW4 = store4pixelTab[avg];
  const store_pixel_t writeW8 = store8pixelTab[avg];
  const int numTaps = filter_params.taps;
  int k;

  // Full-pel positions and scaled steps are left to the C code.
  if (0 == subpel_y_q4 || 16 != y_step_q4) {
    vp10_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
                         subpel_y_q4, y_step_q4, avg);
    return;
  }

  // The table lookup is indexed from sub-pixel position 1.
  coeffTab =
      vp10_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
  if (!coeffTab) {
    vp10_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
                         subpel_y_q4, y_step_q4, avg);
    return;
  }

  for (k = 0; k < 6; ++k) {
    coeffs[k] = *((const __m128i *)(coeffTab + k));
  }

  // Step back to the first row covered by the filter window.
  src -= src_stride * ((numTaps >> 1) - 1);

  if (2 == w) {
    filter_vert_compute_small(src, src_stride, coeffs, numTaps, writeW2, h, dst,
                              dst_stride);
  } else if (4 == w) {
    filter_vert_compute_small(src, src_stride, coeffs, numTaps, writeW4, h, dst,
                              dst_stride);
  } else if (w > 4) {
    filter_vert_compute_large(src, src_stride, coeffs, numTaps, writeW8, w, h,
                              dst, dst_stride);
  } else {
    assert(0);
  }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment