Commit 229690a9 authored by Yi Luo's avatar Yi Luo
Browse files

Convolution horizontal filter SSSE3 optimization

- Apply signal direction/4-pixel vertical/8-pixel vertical
  parallelism.
- Add unit test to verify the bit exact result.
- Overall encoding time improves ~24% on Xeon E5-2680 CPU.

Change-Id: I104dcbfd43451476fee1f94cd16ca5f965878e59
parent d10161ea
......@@ -207,6 +207,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm1d_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm2d_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm2d_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_convolve_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_convolve_optimz_test.cc
TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
......
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vp10_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
namespace {
using std::tr1::tuple;
using libvpx_test::ACMRandom;
// Pointer to a horizontal convolution routine. The argument order, as used by
// the calls in RunHorizFilterBitExactCheck(), is:
//   (src, src_stride, dst, dst_stride, w, h, filter_params, subpel_x_q4,
//    x_step_q4, avg)
typedef void (*conv_horiz_t)(const uint8_t*, int, uint8_t*, int,
int, int, const InterpFilterParams,
const int, int, int);
// Test parameter list:
// <convolve_horiz_func, <width, height>, filter_params, subpel_x_q4, avg>
// BlockDimension holds the <width, height> pair of the block being filtered.
typedef tuple<int, int> BlockDimension;
typedef tuple<conv_horiz_t, BlockDimension, INTERP_FILTER, int, int> ConvParams;
// Note:
// src_ and src_ref_ have a special boundary requirement: they are placed
// vertiOffset rows and horizOffset columns into their buffer region
// (presumably so the filter can read pixels outside the block -- confirm
// against the convolve implementation).
// dst_ and dst_ref_ have no such requirement and start at the beginning of
// their region.
// Each of the four buffer regions is maxWidth * maxHeight bytes.
const size_t maxWidth = 256;
const size_t maxHeight = 256;
const size_t maxBlockSize = maxWidth * maxHeight;
// Offsets of src_ within its region, in columns and rows (rows are counted
// in units of maxWidth, not stride).
const int horizOffset = 32;
const int vertiOffset = 32;
// Row stride used for both source and destination when filling, filtering
// and comparing buffers.
const int stride = 128;
// Passed unchanged as the x_step_q4 argument of every convolve call.
const int x_step_q4 = 16;
// Value-parameterized fixture that runs a horizontal convolution function
// and vp10_convolve_horiz_c on identical inputs and expects bit-exact output.
//
// A single allocation of 4 * maxBlockSize bytes is split into four regions:
//   [0, 1) * maxBlockSize : source for the function under test (src_)
//   [1, 2) * maxBlockSize : source for the C reference (src_ref_)
//   [2, 3) * maxBlockSize : destination of the function under test (dst_)
//   [3, 4) * maxBlockSize : destination of the C reference (dst_ref_)
// src_ and src_ref_ are shifted by vertiOffset rows (of maxWidth bytes) plus
// horizOffset bytes into their regions.
class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
 public:
  virtual ~VP10ConvolveOptimzTest() {}

  // Unpacks the test parameters and sets up the four buffer pointers.
  virtual void SetUp() {
    conv_ = GET_PARAM(0);
    BlockDimension block = GET_PARAM(1);
    width_ = std::tr1::get<0>(block);
    height_ = std::tr1::get<1>(block);
    filter_ = GET_PARAM(2);
    subpel_ = GET_PARAM(3);
    avg_ = GET_PARAM(4);

    // Value-initialize (zero) the whole allocation. The bytes in front of
    // src_ are never written by PrepFilterBuffer(), whereas the bytes in
    // front of src_ref_ are cleared there as the tail of the src region.
    // Without this, any read to the left of the first source row would see
    // indeterminate data on one path and zeros on the other.
    alloc_ = new uint8_t[maxBlockSize * 4]();

    src_ = alloc_ + (vertiOffset * maxWidth);
    src_ += horizOffset;
    src_ref_ = src_ + maxBlockSize;

    dst_ = alloc_ + 2 * maxBlockSize;
    dst_ref_ = alloc_ + 3 * maxBlockSize;
  }

  // Releases the buffer and clears system state after each test.
  virtual void TearDown() {
    delete[] alloc_;
    libvpx_test::ClearSystemState();
  }

 protected:
  // Runs the reference and the function under test and compares outputs.
  void RunHorizFilterBitExactCheck();

 private:
  // Clears the buffers and fills the source/destination blocks with the same
  // pseudo-random data for both code paths.
  void PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
                        uint8_t *dst, uint8_t *dst_ref,
                        int w, int h);
  // Compares a w x h block of buf against buf_ref, reporting each mismatch.
  void DiffFilterBuffer(const uint8_t *buf, const uint8_t *buf_ref,
                        int w, int h, int fgroup, int findex);

  conv_horiz_t conv_;  // function under test
  uint8_t *alloc_;     // owning pointer for all four regions
  uint8_t *src_;
  uint8_t *dst_;
  uint8_t *src_ref_;
  uint8_t *dst_ref_;
  int width_;
  int height_;
  int filter_;  // value passed to vp10_get_interp_filter_params()
  int subpel_;  // passed as subpel_x_q4
  int avg_;     // passed as the avg argument
};
void VP10ConvolveOptimzTest::PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
uint8_t *dst, uint8_t *dst_ref,
int w, int h) {
int r, c;
ACMRandom rnd(ACMRandom::DeterministicSeed());
memset(src, 0, maxBlockSize);
memset(src_ref, 0, maxBlockSize);
memset(dst, 0, maxBlockSize);
memset(dst_ref, 0, maxBlockSize);
uint8_t *src_ptr = src;
uint8_t *dst_ptr = dst;
uint8_t *src_ref_ptr = src_ref;
uint8_t *dst_ref_ptr = dst_ref;
for (r = 0; r < height_; ++r) {
for (c = 0; c < width_; ++c) {
src_ptr[c] = rnd.Rand8();
src_ref_ptr[c] = src_ptr[c];
dst_ptr[c] = rnd.Rand8();
dst_ref_ptr[c] = dst_ptr[c];
}
src_ptr += stride;
src_ref_ptr += stride;
dst_ptr += stride;
dst_ref_ptr += stride;
}
}
// Compares the w x h block at buf with the one at buf_ref (both laid out with
// the file-level stride) and records a non-fatal failure for every pixel that
// differs. filter_group and filter_index only appear in the failure message.
void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf,
                                              const uint8_t *buf_ref,
                                              int w, int h,
                                              int filter_group,
                                              int filter_index) {
  for (int row = 0; row < h; ++row) {
    // The pointer and column names are kept because EXPECT_EQ prints the
    // operand expressions verbatim in its failure output.
    const uint8_t *const dst_ptr = buf + row * stride;
    const uint8_t *const dst_ref_ptr = buf_ref + row * stride;
    for (int c = 0; c < w; ++c) {
      EXPECT_EQ((uint8_t)dst_ref_ptr[c], (uint8_t)dst_ptr[c])
          << "Error at row: " << row << " col: " << c << " "
          << "w = " << w << " " << "h = " << h << " "
          << "filter group index = " << filter_group << " "
          << "filter index = " << filter_index;
    }
  }
}
void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_);
conv_(src_, stride, dst_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_);
DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
// Note:
// Here we need calculate a height which is different from the specified one
// and test again.
int intermediate_height =
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
intermediate_height, filter_params, subpel_, x_step_q4,
avg_);
conv_(src_, stride, dst_, stride, width_,
intermediate_height, filter_params, subpel_, x_step_q4,
avg_);
DiffFilterBuffer(dst_, dst_ref_, width_, intermediate_height, filter_,
subpel_);
}
// For every parameter combination, the function under test must produce the
// same output as vp10_convolve_horiz_c.
TEST_P(VP10ConvolveOptimzTest, HorizBitExactCheck) {
RunHorizFilterBitExactCheck();
}
using std::tr1::make_tuple;
// Block sizes under test as (width, height) pairs; SetUp() reads element 0 as
// the width and element 1 as the height.
const BlockDimension kBlockDim[] = {
make_tuple(4, 4),
make_tuple(4, 8),
make_tuple(8, 4),
make_tuple(8, 8),
make_tuple(8, 16),
make_tuple(16, 8),
make_tuple(16, 16),
make_tuple(16, 32),
make_tuple(32, 16),
make_tuple(32, 32),
make_tuple(32, 64),
make_tuple(64, 32),
make_tuple(64, 64),
make_tuple(64, 128),
make_tuple(128, 64),
make_tuple(128, 128),
};
// 10/12-tap filters
// The numbers are INTERP_FILTER values handed to
// vp10_get_interp_filter_params().
// NOTE(review): which named filter each value selects is not visible here --
// confirm against the INTERP_FILTER definitions.
const INTERP_FILTER kFilter[] = {6, 4, 2};
// Sub-pixel offsets passed as subpel_x_q4; 0 is not tested.
const int kSubpelXQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
// Values passed as the avg argument of the convolve functions.
const int kAvg[] = {0, 1};
// The SSSE3 instantiation is only built when SSSE3 and the extended
// interpolation filters are both enabled; it covers every combination of the
// tables above.
#if HAVE_SSSE3 && CONFIG_EXT_INTERP
INSTANTIATE_TEST_CASE_P(
SSSE3, VP10ConvolveOptimzTest,
::testing::Combine(
::testing::Values(vp10_convolve_horiz_ssse3),
::testing::ValuesIn(kBlockDim),
::testing::ValuesIn(kFilter),
::testing::ValuesIn(kSubpelXQ4),
::testing::ValuesIn(kAvg)));
#endif // HAVE_SSSE3 && CONFIG_EXT_INTERP
} // namespace
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vp10_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
#include "vp10/common/filter.h"
......@@ -40,6 +41,8 @@ TEST(VP10ConvolveTest, vp10_convolve8) {
int w = 1;
int h = 1;
vp10_rtcd();
for (int i = 0; i < filter_size * filter_size; i++) {
src[i] = rnd.Rand16() % (1 << 8);
}
......@@ -86,6 +89,8 @@ TEST(VP10ConvolveTest, vp10_convolve) {
int subpel_x_q4;
int subpel_y_q4;
vp10_rtcd();
for (int i = 0; i < filter_size * filter_size; i++) {
src[i] = rnd.Rand16() % (1 << 8);
}
......@@ -150,6 +155,8 @@ TEST(VP10ConvolveTest, vp10_convolve_avg) {
int subpel_x_q4;
int subpel_y_q4;
vp10_rtcd();
for (int i = 0; i < filter_size * filter_size; i++) {
src0[i] = rnd.Rand16() % (1 << 8);
src1[i] = rnd.Rand16() % (1 << 8);
......
......@@ -302,3 +302,43 @@ const int16_t *vp10_get_interp_filter_kernel(
return (const int16_t*)
vp10_interp_filter_params_list[interp_filter].filter_ptr;
}
// Looks up the "signal direction" coefficient rows that correspond to the
// filter described by p.
//
// p:     filter parameters; only p.filter_ptr is inspected, and it is matched
//        by address against the known filter tables.
// index: first-dimension index into the matching table (each table has 15
//        entries); it is not range-checked here.
//
// Returns a pointer to the first 16-coefficient row of the selected entry, or
// NULL when p.filter_ptr matches none of the tables enabled at build time.
SubpelFilterCoeffs vp10_get_subpel_filter_signal_dir(
    const InterpFilterParams p, int index) {
  SubpelFilterCoeffs coeffs = NULL;
#if CONFIG_EXT_INTERP && HAVE_SSSE3
  if (p.filter_ptr == (const int16_t *)sub_pel_filters_12sharp) {
    coeffs = sub_pel_filters_12sharp_signal_dir[index];
  } else if (p.filter_ptr == (const int16_t *)sub_pel_filters_10sharp) {
    coeffs = sub_pel_filters_10sharp_signal_dir[index];
  }
#endif
#if USE_TEMPORALFILTER_12TAP && HAVE_SSSE3
  if (coeffs == NULL &&
      p.filter_ptr == (const int16_t *)sub_pel_filters_temporalfilter_12) {
    coeffs = sub_pel_filters_temporalfilter_12_signal_dir[index];
  }
#endif
  // Keep the parameters "used" when every table above is compiled out.
  (void)p;
  (void)index;
  return coeffs;
}
// Looks up the "ver_signal_dir" coefficient rows that correspond to the
// filter described by p.
//
// p:     filter parameters; only p.filter_ptr is inspected, and it is matched
//        by address against the known filter tables.
// index: first-dimension index into the matching table (each table has 15
//        entries); it is not range-checked here.
//
// Returns a pointer to the first 16-coefficient row of the selected entry, or
// NULL when p.filter_ptr matches none of the tables enabled at build time.
SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
    const InterpFilterParams p, int index) {
  SubpelFilterCoeffs coeffs = NULL;
#if CONFIG_EXT_INTERP && HAVE_SSSE3
  if (p.filter_ptr == (const int16_t *)sub_pel_filters_12sharp) {
    coeffs = sub_pel_filters_12sharp_ver_signal_dir[index];
  } else if (p.filter_ptr == (const int16_t *)sub_pel_filters_10sharp) {
    coeffs = sub_pel_filters_10sharp_ver_signal_dir[index];
  }
#endif
#if USE_TEMPORALFILTER_12TAP && HAVE_SSSE3
  if (coeffs == NULL &&
      p.filter_ptr == (const int16_t *)sub_pel_filters_temporalfilter_12) {
    coeffs = sub_pel_filters_temporalfilter_12_ver_signal_dir[index];
  }
#endif
  // Keep the parameters "used" when every table above is compiled out.
  (void)p;
  (void)index;
  return coeffs;
}
......@@ -91,6 +91,27 @@ static INLINE int vp10_is_interpolating_filter(
const InterpFilterParams ip = vp10_get_interp_filter_params(interp_filter);
return (ip.filter_ptr[ip.taps / 2 - 1] == 128);
}
// Re-arranged int8_t copies of the 10/12-tap sub-pixel filters. Each table
// has 15 entries; an entry is a group of 16-coefficient rows (2 rows in the
// *_signal_dir tables, 6 rows in the *_ver_signal_dir tables).
#if USE_TEMPORALFILTER_12TAP
extern const int8_t sub_pel_filters_temporalfilter_12_signal_dir[15][2][16];
extern const int8_t sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16];
#endif
#if CONFIG_EXT_INTERP
extern const int8_t sub_pel_filters_12sharp_signal_dir[15][2][16];
extern const int8_t sub_pel_filters_10sharp_signal_dir[15][2][16];
extern const int8_t sub_pel_filters_12sharp_ver_signal_dir[15][6][16];
extern const int8_t sub_pel_filters_10sharp_ver_signal_dir[15][6][16];
#endif
// Pointer to the first 16-coefficient row of one table entry; further rows of
// the same entry are reached by indexing this pointer.
typedef const int8_t (*SubpelFilterCoeffs)[16];
// Returns entry `index` of the *_signal_dir table whose source filter is
// p.filter_ptr, or NULL if no table matches.
SubpelFilterCoeffs vp10_get_subpel_filter_signal_dir(
const InterpFilterParams p, int index);
// Same lookup as above, but into the *_ver_signal_dir tables.
SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
const InterpFilterParams p, int index);
#ifdef __cplusplus
} // extern "C"
#endif
......
#include <assert.h>
#include <string.h>
#include "./vp10_rtcd.h"
#include "vp10/common/filter.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
......@@ -10,7 +11,7 @@
#define MAX_STEP (32)
#define MAX_FILTER_TAP (12)
static void convolve_horiz(const uint8_t *src, int src_stride, uint8_t *dst,
void vp10_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
const InterpFilterParams filter_params,
const int subpel_x_q4, int x_step_q4, int avg) {
......@@ -121,8 +122,8 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
vp10_get_interp_filter_params(interp_filter);
#endif
assert(filter_params.taps <= MAX_FILTER_TAP);
convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
subpel_x_q4, x_step_q4, ref_idx);
vp10_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
subpel_x_q4, x_step_q4, ref_idx);
} else if (ignore_horiz) {
#if CONFIG_DUAL_FILTER
InterpFilterParams filter_params =
......@@ -162,9 +163,9 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
assert(filter_params.taps <= MAX_FILTER_TAP);
convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, temp,
temp_stride, w, intermediate_height, filter_params,
subpel_x_q4, x_step_q4, 0);
vp10_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
temp, temp_stride, w, intermediate_height,
filter_params, subpel_x_q4, x_step_q4, 0);
#if CONFIG_DUAL_FILTER
filter_params = filter_params_y;
......
......@@ -8,6 +8,7 @@ print <<EOF
#include "vp10/common/common.h"
#include "vp10/common/enums.h"
#include "vp10/common/quant_common.h"
#include "vp10/common/filter.h"
#include "vp10/common/vp10_txfm.h"
struct macroblockd;
......@@ -83,6 +84,12 @@ add_proto qw/void vp10_filter_by_weight8x8/, "const uint8_t *src, int src_stride
specialize qw/vp10_filter_by_weight8x8 sse2 msa/;
}
#
# 10/12-tap convolution filters
#
add_proto qw/void vp10_convolve_horiz/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
specialize qw/vp10_convolve_horiz ssse3/;
#
# dct
#
......
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp10/common/filter.h"
// Note:
// Filter coefficients are from "filter.c". We use,
// sub_pel_filters_temporalfilter_12[],
// sub_pel_filters_12sharp[],
// sub_pel_filters_10sharp[].
// (2-1) Parallel filtering along the intended signal direction
// 12-tap filter padding:
// {filter_coefficients, 0, 0, 0, 0},
// {0, 0, filter_coefficients, 0, 0},
#if USE_TEMPORALFILTER_12TAP
// Signal-direction layout of sub_pel_filters_temporalfilter_12[].
// 15 entries, each made of two 16-coefficient rows: the first row holds the
// 12 taps followed by four zeros, the second row holds the same taps shifted
// right by two positions (two leading and two trailing zeros).
// The taps of every entry sum to 128.
DECLARE_ALIGNED(16, const int8_t,
sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]) = {
{
{0, 1, -1, 3, -7, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -1, 3, -7, 127, 8, -4, 2, -1, 0, 0, 0, 0},
},
{
{0, 1, -3, 5, -12, 124, 18, -8, 4, -2, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -3, 5, -12, 124, 18, -8, 4, -2, 1, 0, 0, 0},
},
{
{-1, 2, -4, 8, -17, 120, 28, -11, 6, -3, 1, -1, 0, 0, 0, 0},
{0, 0, -1, 2, -4, 8, -17, 120, 28, -11, 6, -3, 1, -1, 0, 0},
},
{
{-1, 2, -4, 10, -21, 114, 38, -15, 8, -4, 2, -1, 0, 0, 0, 0},
{0, 0, -1, 2, -4, 10, -21, 114, 38, -15, 8, -4, 2, -1, 0, 0},
},
{
{-1, 3, -5, 11, -23, 107, 49, -18, 9, -5, 2, -1, 0, 0, 0, 0},
{0, 0, -1, 3, -5, 11, -23, 107, 49, -18, 9, -5, 2, -1, 0, 0},
},
{
{-1, 3, -6, 12, -25, 99, 60, -21, 11, -6, 3, -1, 0, 0, 0, 0},
{0, 0, -1, 3, -6, 12, -25, 99, 60, -21, 11, -6, 3, -1, 0, 0},
},
{
{-1, 3, -6, 12, -25, 90, 70, -23, 12, -6, 3, -1, 0, 0, 0, 0},
{0, 0, -1, 3, -6, 12, -25, 90, 70, -23, 12, -6, 3, -1, 0, 0},
},
{
{-1, 3, -6, 12, -24, 80, 80, -24, 12, -6, 3, -1, 0, 0, 0, 0},
{0, 0, -1, 3, -6, 12, -24, 80, 80, -24, 12, -6, 3, -1, 0, 0},
},
{
{-1, 3, -6, 12, -23, 70, 90, -25, 12, -6, 3, -1, 0, 0, 0, 0},
{0, 0, -1, 3, -6, 12, -23, 70, 90, -25, 12, -6, 3, -1, 0, 0},
},
{
{-1, 3, -6, 11, -21, 60, 99, -25, 12, -6, 3, -1, 0, 0, 0, 0},
{0, 0, -1, 3, -6, 11, -21, 60, 99, -25, 12, -6, 3, -1, 0, 0},
},
{
{-1, 2, -5, 9, -18, 49, 107, -23, 11, -5, 3, -1, 0, 0, 0, 0},
{0, 0, -1, 2, -5, 9, -18, 49, 107, -23, 11, -5, 3, -1, 0, 0},
},
{
{-1, 2, -4, 8, -15, 38, 114, -21, 10, -4, 2, -1, 0, 0, 0, 0},
{0, 0, -1, 2, -4, 8, -15, 38, 114, -21, 10, -4, 2, -1, 0, 0},
},
{
{-1, 1, -3, 6, -11, 28, 120, -17, 8, -4, 2, -1, 0, 0, 0, 0},
{0, 0, -1, 1, -3, 6, -11, 28, 120, -17, 8, -4, 2, -1, 0, 0},
},
{
{0, 1, -2, 4, -8, 18, 124, -12, 5, -3, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -2, 4, -8, 18, 124, -12, 5, -3, 1, 0, 0, 0},
},
{
{0, 0, -1, 2, -4, 8, 127, -7, 3, -1, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 0, -1, 2, -4, 8, 127, -7, 3, -1, 1, 0, 0, 0},
},
};
#endif // USE_TEMPORALFILTER_12TAP
#if CONFIG_EXT_INTERP
// Signal-direction layout of sub_pel_filters_12sharp[].
// 15 entries, each made of two 16-coefficient rows: the first row holds the
// 12 taps followed by four zeros, the second row holds the same taps shifted
// right by two positions (two leading and two trailing zeros).
// The taps of every entry sum to 128.
DECLARE_ALIGNED(16, const int8_t,
sub_pel_filters_12sharp_signal_dir[15][2][16]) = {
{
{0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0, 0, 0},
},
{
{-1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1, 0, 0, 0, 0},
{0, 0, -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1, 0, 0},
},
{
{-1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0, 0, 0, 0},
{0, 0, -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0, 0},
},
{
{-1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1, 0, 0, 0, 0},
{0, 0, -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1, 0, 0},
},
{
{-2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2, 0, 0, 0, 0},
{0, 0, -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2, 0, 0},
},
{
{-2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2, 0, 0, 0, 0},
{0, 0, -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2, 0, 0},
},
{
{-2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2, 0, 0, 0, 0},
{0, 0, -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2, 0, 0},
},
{
{-2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2, 0, 0, 0, 0},
{0, 0, -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2, 0, 0},
},
{
{-2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2, 0, 0, 0, 0},
{0, 0, -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2, 0, 0},
},
{
{-2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2, 0, 0, 0, 0},
{0, 0, -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2, 0, 0},
},
{
{-2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2, 0, 0, 0, 0},
{0, 0, -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2, 0, 0},
},
{
{-1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1, 0, 0, 0, 0},
{0, 0, -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1, 0, 0},
},
{
{-1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1, 0, 0, 0, 0},
{0, 0, -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1, 0, 0},
},
{
{-1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1, 0, 0, 0, 0},
{0, 0, -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1, 0, 0},
},
{
{0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0, 0, 0, 0, 0},
{0, 0, 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0, 0, 0},
},
};
// 10-tap filter padding:
// {0, filter_coefficients, 0, 0, 0, 0, 0},
// {0, 0, 0, filter_coefficients, 0, 0, 0},
DECLARE_ALIGNED(16, const int8_t,
sub_pel_filters_10sharp_signal_dir[15][2][16]) = {
{
{0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0},
{0, 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0},
},
{
{0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0, 0, 0},
{0, 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0},
},
{
{0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0},
{0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0},
},
{
{0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0, 0, 0},
{0, 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0},
},
{
{0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0, 0, 0},
{0, 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0},
},
{