Commit cb507ff2 authored by Yi Luo, committed by Gerrit Code Review

Merge "HBD inverse HT 8x8 and 16x16 sse4.1 optimization" into nextgenv2

parents cf5083d4 28cdee44
......@@ -182,7 +182,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_mask6_test.cc
endif
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_iht4x4_test.cc
LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_highbd_iht_test.cc
endif # CONFIG_VP9_HIGHBITDEPTH
endif # VP10
......
......@@ -23,33 +23,40 @@ namespace {
using std::tr1::tuple;
using libvpx_test::ACMRandom;
void iht4x4_ref(const int32_t *coeff, uint16_t *output, int stride,
int tx_type, int bd) {
vp10_inv_txfm2d_add_4x4_c(coeff, output, stride, tx_type, bd);
}
typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
int tx_type, int bd);
typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride,
int tx_type, int bd);
// IhbdHt4x4Param argument list:
// <target optimization function, tx_type, bit_depth>
typedef tuple<IHbdHtFunc, int, int> IHbdHt4x4Param;
// Test parameter argument list:
// <transform reference function,
// optimized inverse transform function,
// inverse transform reference function,
// num_coeffs,
// tx_type,
// bit_depth>
typedef tuple<HbdHtFunc, IHbdHtFunc, IHbdHtFunc, int, int, int> IHbdHtParam;
class VP10HighbdInvTrans4x4HT
: public ::testing::TestWithParam<IHbdHt4x4Param> {
class VP10HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
public:
virtual ~VP10HighbdInvTrans4x4HT() {}
virtual ~VP10HighbdInvHTNxN() {}
virtual void SetUp() {
inv_txfm_ = GET_PARAM(0);
tx_type_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
num_coeffs_ = 4 * 4;
txfm_ref_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
inv_txfm_ref_ = GET_PARAM(2);
num_coeffs_ = GET_PARAM(3);
tx_type_ = GET_PARAM(4);
bit_depth_ = GET_PARAM(5);
input_ = reinterpret_cast<int16_t *>(
vpx_memalign(16, sizeof(input_[0]) * num_coeffs_));
// Note:
// Inverse transform input buffer is 32-byte aligned
// refer to function void alloc_mode_context() in
// vp10/encoder/context_tree.c
// Refer to <root>/vp10/encoder/context_tree.c, function,
// void alloc_mode_context().
coeffs_ = reinterpret_cast<int32_t *>(
vpx_memalign(32, sizeof(coeffs_[0]) * num_coeffs_));
output_ = reinterpret_cast<uint16_t *>(
......@@ -59,6 +66,7 @@ class VP10HighbdInvTrans4x4HT
}
virtual void TearDown() {
vpx_free(input_);
vpx_free(coeffs_);
vpx_free(output_);
vpx_free(output_ref_);
......@@ -69,67 +77,109 @@ class VP10HighbdInvTrans4x4HT
void RunBitexactCheck();
private:
static int32_t ClampCoeffs(int number, int bit) {
const int max = (1 << bit) - 1;
const int min = -max;
return clamp(number, min, max);
int GetStride() const {
if (16 == num_coeffs_) {
return 4;
} else if (64 == num_coeffs_) {
return 8;
} else if (256 == num_coeffs_) {
return 16;
} else {
return 0;
}
}
HbdHtFunc txfm_ref_;
IHbdHtFunc inv_txfm_;
IHbdHtFunc inv_txfm_ref_;
int num_coeffs_;
int tx_type_;
int bit_depth_;
int num_coeffs_;
int16_t *input_;
int32_t *coeffs_;
uint16_t *output_;
uint16_t *output_ref_;
};
void VP10HighbdInvTrans4x4HT::RunBitexactCheck() {
void VP10HighbdInvHTNxN::RunBitexactCheck() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int stride = 4;
const int num_tests = 2000000;
const int stride = GetStride();
const int num_tests = 20000;
const uint16_t mask = (1 << bit_depth_) - 1;
for (int i = 0; i < num_tests; ++i) {
for (int j = 0; j < num_coeffs_; ++j) {
coeffs_[j] = ClampCoeffs((rnd.Rand16() - rnd.Rand16()) << 2, 18);
input_[j] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);
output_ref_[j] = rnd.Rand16() & mask;
output_[j] = output_ref_[j];
}
iht4x4_ref(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
txfm_ref_(input_, coeffs_, stride, tx_type_, bit_depth_);
inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
ASM_REGISTER_STATE_CHECK(inv_txfm_(coeffs_, output_, stride, tx_type_,
bit_depth_));
for (int j = 0; j < num_coeffs_; ++j) {
EXPECT_EQ(output_ref_[j], output_[j])
<< "Not bit-exact result at index: " << j
<< "At test block: " << i;
<< " At test block: " << i;
}
}
}
TEST_P(VP10HighbdInvTrans4x4HT, InvTransResultCheck) {
TEST_P(VP10HighbdInvHTNxN, InvTransResultCheck) {
RunBitexactCheck();
}
using std::tr1::make_tuple;
#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
const IHbdHt4x4Param kArrayIht4x4Param[] = {
make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 0, 10),
make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 0, 12),
make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 1, 10),
make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 1, 12),
make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 2, 10),
make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 2, 12),
make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 3, 10),
make_tuple(&vp10_inv_txfm2d_add_4x4_sse4_1, 3, 12)
#define PARAM_LIST_4X4 &vp10_fwd_txfm2d_4x4_c, \
&vp10_inv_txfm2d_add_4x4_sse4_1, \
&vp10_inv_txfm2d_add_4x4_c, 16
#define PARAM_LIST_8X8 &vp10_fwd_txfm2d_8x8_c, \
&vp10_inv_txfm2d_add_8x8_sse4_1, \
&vp10_inv_txfm2d_add_8x8_c, 64
#define PARAM_LIST_16X16 &vp10_fwd_txfm2d_16x16_c, \
&vp10_inv_txfm2d_add_16x16_sse4_1, \
&vp10_inv_txfm2d_add_16x16_c, 256
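// Note: each PARAM_LIST_* macro above supplies the first four tuple fields
// (forward reference, sse4.1 inverse, reference inverse, num_coeffs); the two
// trailing literals are tx_type and bit_depth. For example,
// make_tuple(PARAM_LIST_16X16, 0, 10) expands to
// make_tuple(&vp10_fwd_txfm2d_16x16_c, &vp10_inv_txfm2d_add_16x16_sse4_1,
//            &vp10_inv_txfm2d_add_16x16_c, 256, 0, 10).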
const IHbdHtParam kArrayIhtParam[] = {
// 16x16
make_tuple(PARAM_LIST_16X16, 0, 10),
make_tuple(PARAM_LIST_16X16, 0, 12),
make_tuple(PARAM_LIST_16X16, 1, 10),
make_tuple(PARAM_LIST_16X16, 1, 12),
make_tuple(PARAM_LIST_16X16, 2, 10),
make_tuple(PARAM_LIST_16X16, 2, 12),
make_tuple(PARAM_LIST_16X16, 3, 10),
make_tuple(PARAM_LIST_16X16, 3, 12),
// 8x8
make_tuple(PARAM_LIST_8X8, 0, 10),
make_tuple(PARAM_LIST_8X8, 0, 12),
make_tuple(PARAM_LIST_8X8, 1, 10),
make_tuple(PARAM_LIST_8X8, 1, 12),
make_tuple(PARAM_LIST_8X8, 2, 10),
make_tuple(PARAM_LIST_8X8, 2, 12),
make_tuple(PARAM_LIST_8X8, 3, 10),
make_tuple(PARAM_LIST_8X8, 3, 12),
// 4x4
make_tuple(PARAM_LIST_4X4, 0, 10),
make_tuple(PARAM_LIST_4X4, 0, 12),
make_tuple(PARAM_LIST_4X4, 1, 10),
make_tuple(PARAM_LIST_4X4, 1, 12),
make_tuple(PARAM_LIST_4X4, 2, 10),
make_tuple(PARAM_LIST_4X4, 2, 12),
make_tuple(PARAM_LIST_4X4, 3, 10),
make_tuple(PARAM_LIST_4X4, 3, 12),
};
INSTANTIATE_TEST_CASE_P(
SSE4_1, VP10HighbdInvTrans4x4HT,
::testing::ValuesIn(kArrayIht4x4Param));
SSE4_1, VP10HighbdInvHTNxN,
::testing::ValuesIn(kArrayIhtParam));
#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
} // namespace
......@@ -629,9 +629,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp10_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
specialize qw/vp10_inv_txfm2d_add_4x4 sse4_1/;
add_proto qw/void vp10_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
specialize qw/vp10_inv_txfm2d_add_8x8/;
specialize qw/vp10_inv_txfm2d_add_8x8 sse4_1/;
add_proto qw/void vp10_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
specialize qw/vp10_inv_txfm2d_add_16x16/;
specialize qw/vp10_inv_txfm2d_add_16x16 sse4_1/;
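# Note: with the sse4_1 specializations above, the generated rtcd dispatcher
# should resolve vp10_inv_txfm2d_add_8x8/16x16 to the _sse4_1 variants when
# SSE4.1 is detected at run time, falling back to the _c implementations
# otherwise.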
add_proto qw/void vp10_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
specialize qw/vp10_inv_txfm2d_add_32x32/;
add_proto qw/void vp10_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
......
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef _HIGHBD_TXFM_UTILITY_SSE4_H
#define _HIGHBD_TXFM_UTILITY_SSE4_H
#include <smmintrin.h> /* SSE4.1 */
#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
do { \
__m128i u0, u1, u2, u3; \
u0 = _mm_unpacklo_epi32(x0, x1); \
u1 = _mm_unpackhi_epi32(x0, x1); \
u2 = _mm_unpacklo_epi32(x2, x3); \
u3 = _mm_unpackhi_epi32(x2, x3); \
y0 = _mm_unpacklo_epi64(u0, u2); \
y1 = _mm_unpackhi_epi64(u0, u2); \
y2 = _mm_unpacklo_epi64(u1, u3); \
y3 = _mm_unpackhi_epi64(u1, u3); \
} while (0)
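// TRANSPOSE_4X4 transposes a 4x4 tile of 32-bit values: y0..y3 receive
// columns 0..3 of the input rows x0..x3. The helpers below assume the
// register layout used by these SSE4.1 transforms: for an 8x8 block,
// in[2 * r] holds columns 0-3 of row r and in[2 * r + 1] holds columns 4-7;
// transpose_16x16 extends the same scheme with four registers per row.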
static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
TRANSPOSE_4X4(in[0], in[2], in[4], in[6],
out[0], out[2], out[4], out[6]);
TRANSPOSE_4X4(in[1], in[3], in[5], in[7],
out[8], out[10], out[12], out[14]);
TRANSPOSE_4X4(in[8], in[10], in[12], in[14],
out[1], out[3], out[5], out[7]);
TRANSPOSE_4X4(in[9], in[11], in[13], in[15],
out[9], out[11], out[13], out[15]);
}
static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
// Upper left 8x8
TRANSPOSE_4X4(in[0], in[4], in[8], in[12],
out[0], out[4], out[8], out[12]);
TRANSPOSE_4X4(in[1], in[5], in[9], in[13],
out[16], out[20], out[24], out[28]);
TRANSPOSE_4X4(in[16], in[20], in[24], in[28],
out[1], out[5], out[9], out[13]);
TRANSPOSE_4X4(in[17], in[21], in[25], in[29],
out[17], out[21], out[25], out[29]);
// Upper right 8x8
TRANSPOSE_4X4(in[2], in[6], in[10], in[14],
out[32], out[36], out[40], out[44]);
TRANSPOSE_4X4(in[3], in[7], in[11], in[15],
out[48], out[52], out[56], out[60]);
TRANSPOSE_4X4(in[18], in[22], in[26], in[30],
out[33], out[37], out[41], out[45]);
TRANSPOSE_4X4(in[19], in[23], in[27], in[31],
out[49], out[53], out[57], out[61]);
// Lower left 8x8
TRANSPOSE_4X4(in[32], in[36], in[40], in[44],
out[2], out[6], out[10], out[14]);
TRANSPOSE_4X4(in[33], in[37], in[41], in[45],
out[18], out[22], out[26], out[30]);
TRANSPOSE_4X4(in[48], in[52], in[56], in[60],
out[3], out[7], out[11], out[15]);
TRANSPOSE_4X4(in[49], in[53], in[57], in[61],
out[19], out[23], out[27], out[31]);
// Lower right 8x8
TRANSPOSE_4X4(in[34], in[38], in[42], in[46],
out[34], out[38], out[42], out[46]);
TRANSPOSE_4X4(in[35], in[39], in[43], in[47],
out[50], out[54], out[58], out[62]);
TRANSPOSE_4X4(in[50], in[54], in[58], in[62],
out[35], out[39], out[43], out[47]);
TRANSPOSE_4X4(in[51], in[55], in[59], in[63],
out[51], out[55], out[59], out[63]);
}
// Note:
// rounding = 1 << (bit - 1)
static INLINE __m128i half_btf_sse4_1(__m128i w0, __m128i n0,
__m128i w1, __m128i n1,
__m128i rounding, int bit) {
__m128i x, y;
x = _mm_mullo_epi32(w0, n0);
y = _mm_mullo_epi32(w1, n1);
x = _mm_add_epi32(x, y);
x = _mm_add_epi32(x, rounding);
x = _mm_srai_epi32(x, bit);
return x;
}
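// half_btf_sse4_1() computes one butterfly output per 32-bit lane:
//   out = (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit
// where the products keep the low 32 bits (_mm_mullo_epi32) and the shift is
// arithmetic (_mm_srai_epi32); the caller passes rounding = 1 << (bit - 1),
// matching the note above.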
#endif // _HIGHBD_TXFM_UTILITY_SSE4_H
......@@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
#include "vp10/common/vp10_txfm.h"
#include "vp10/common/x86/highbd_txfm_utility_sse4.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
......@@ -406,30 +407,6 @@ static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
in[15] = _mm_srai_epi32(in[15], shift);
}
#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
do { \
__m128i u0, u1, u2, u3; \
u0 = _mm_unpacklo_epi32(x0, x1); \
u1 = _mm_unpackhi_epi32(x0, x1); \
u2 = _mm_unpacklo_epi32(x2, x3); \
u3 = _mm_unpackhi_epi32(x2, x3); \
y0 = _mm_unpacklo_epi64(u0, u2); \
y1 = _mm_unpackhi_epi64(u0, u2); \
y2 = _mm_unpacklo_epi64(u1, u3); \
y3 = _mm_unpackhi_epi64(u1, u3); \
} while (0)
static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
TRANSPOSE_4X4(in[0], in[2], in[4], in[6],
out[0], out[2], out[4], out[6]);
TRANSPOSE_4X4(in[1], in[3], in[5], in[7],
out[8], out[10], out[12], out[14]);
TRANSPOSE_4X4(in[8], in[10], in[12], in[14],
out[1], out[3], out[5], out[7]);
TRANSPOSE_4X4(in[9], in[11], in[13], in[15],
out[9], out[11], out[13], out[15]);
}
static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
_mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
_mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
......@@ -1797,47 +1774,6 @@ static void col_txfm_16x16_rounding(__m128i *in, int shift) {
col_txfm_8x8_rounding(&in[48], shift);
}
static void transpose_16x16(const __m128i *in, __m128i *out) {
// Upper left 8x8
TRANSPOSE_4X4(in[0], in[4], in[8], in[12],
out[0], out[4], out[8], out[12]);
TRANSPOSE_4X4(in[1], in[5], in[9], in[13],
out[16], out[20], out[24], out[28]);
TRANSPOSE_4X4(in[16], in[20], in[24], in[28],
out[1], out[5], out[9], out[13]);
TRANSPOSE_4X4(in[17], in[21], in[25], in[29],
out[17], out[21], out[25], out[29]);
// Upper right 8x8
TRANSPOSE_4X4(in[2], in[6], in[10], in[14],
out[32], out[36], out[40], out[44]);
TRANSPOSE_4X4(in[3], in[7], in[11], in[15],
out[48], out[52], out[56], out[60]);
TRANSPOSE_4X4(in[18], in[22], in[26], in[30],
out[33], out[37], out[41], out[45]);
TRANSPOSE_4X4(in[19], in[23], in[27], in[31],
out[49], out[53], out[57], out[61]);
// Lower left 8x8
TRANSPOSE_4X4(in[32], in[36], in[40], in[44],
out[2], out[6], out[10], out[14]);
TRANSPOSE_4X4(in[33], in[37], in[41], in[45],
out[18], out[22], out[26], out[30]);
TRANSPOSE_4X4(in[48], in[52], in[56], in[60],
out[3], out[7], out[11], out[15]);
TRANSPOSE_4X4(in[49], in[53], in[57], in[61],
out[19], out[23], out[27], out[31]);
// Lower right 8x8
TRANSPOSE_4X4(in[34], in[38], in[42], in[46],
out[34], out[38], out[42], out[46]);
TRANSPOSE_4X4(in[35], in[39], in[43], in[47],
out[50], out[54], out[58], out[62]);
TRANSPOSE_4X4(in[50], in[54], in[58], in[62],
out[35], out[39], out[43], out[47]);
TRANSPOSE_4X4(in[51], in[55], in[59], in[63],
out[51], out[55], out[59], out[63]);
}
static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
const int size_8x8 = 16 * 4;
write_buffer_8x8(&in[0], output);
......
......@@ -112,6 +112,10 @@ VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_txfm1d_sse4.h
VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm1d_sse4.c
VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm2d_sse4.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_txfm_utility_sse4.h
endif
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c
VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c
......