Commit 95f52605 authored by Sarah Parker, committed by Urvang Joshi

Replace hbd adst4 with lbd adst4

0.05% drop in performance for 10 bit
0.03% drop in performance for 12 bit

Updated relevant tests:
- Use the fadst4 function from VP9 as the reference.
- Update some max/avg error thresholds

Change-Id: Ic8c5b591eea3309427d2bb42828d44e640f718a1
parent 8b618f62
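
For context while reading the hunks below: the lbd adst4 adopted here evaluates the 4-point ADST as a fixed integer matrix built from Q14 multiples of sin(k*pi/9), using only five multiplies plus the identity sinpi_1_9 + sinpi_2_9 == sinpi_4_9. The following is a minimal standalone sketch, not library code: the helper names (fadst4_fast, fadst4_matrix, round_shift14) are hypothetical, and it assumes the libvpx constant values 5283/9929/13377/15212 with a round shift of 14.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Q14 sin(k*pi/9) constants, folded with the transform's scale factor,
 * as defined in libvpx's vpx_dsp/txfm_common.h. */
enum { SINPI_1_9 = 5283, SINPI_2_9 = 9929, SINPI_3_9 = 13377, SINPI_4_9 = 15212 };

/* Round to nearest and drop the 14 fractional bits (arithmetic shift). */
static int32_t round_shift14(int64_t x) { return (int32_t)((x + (1 << 13)) >> 14); }

/* Five-multiply fadst4, mirroring the structure used in the diff. */
static void fadst4_fast(const int32_t *in, int32_t *out) {
  int64_t s0 = SINPI_1_9 * (int64_t)in[0];
  int64_t s1 = SINPI_4_9 * (int64_t)in[0];
  int64_t s2 = SINPI_2_9 * (int64_t)in[1];
  int64_t s3 = SINPI_1_9 * (int64_t)in[1];
  int64_t s4 = SINPI_3_9 * (int64_t)in[2];
  int64_t s5 = SINPI_4_9 * (int64_t)in[3];
  int64_t s6 = SINPI_2_9 * (int64_t)in[3];
  int64_t s7 = in[0] + in[1] - in[3];
  int64_t x0 = s0 + s2 + s5;
  int64_t x1 = SINPI_3_9 * s7;
  int64_t x2 = s1 - s3 + s6;
  int64_t x3 = s4;
  out[0] = round_shift14(x0 + x3);
  out[1] = round_shift14(x1);
  out[2] = round_shift14(x2 - x3);
  out[3] = round_shift14(x2 - x0 + x3);
}

/* The same transform as an explicit 4x4 matrix multiply; the rows follow
 * from expanding the fast version and the sinpi identities. */
static void fadst4_matrix(const int32_t *in, int32_t *out) {
  static const int32_t m[4][4] = {
    { SINPI_1_9, SINPI_2_9, SINPI_3_9, SINPI_4_9 },
    { SINPI_3_9, SINPI_3_9, 0, -SINPI_3_9 },
    { SINPI_4_9, -SINPI_1_9, -SINPI_3_9, SINPI_2_9 },
    { SINPI_2_9, -SINPI_4_9, SINPI_3_9, -SINPI_1_9 },
  };
  for (int i = 0; i < 4; ++i) {
    int64_t acc = 0;
    for (int j = 0; j < 4; ++j) acc += (int64_t)m[i][j] * in[j];
    out[i] = round_shift14(acc);
  }
}

int main(void) {
  /* sin(pi/9) + sin(2*pi/9) == sin(4*pi/9), exactly preserved in Q14. */
  assert(SINPI_1_9 + SINPI_2_9 == SINPI_4_9);
  const int32_t in[4] = { 100, -7, 63, 12 };
  int32_t a[4], b[4];
  fadst4_fast(in, a);
  fadst4_matrix(in, b);
  for (int i = 0; i < 4; ++i) assert(a[i] == b[i]);
  printf("fast and matrix fadst4 agree\n");
  return 0;
}

The two paths produce bit-identical results because every row of the matrix is an exact integer rearrangement of the five products.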
@@ -735,67 +735,42 @@ void av1_idct32_new(const int32_t *input, int32_t *output,
 void av1_iadst4_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range) {
-  const int32_t size = 4;
-  const int32_t *cospi;
-  int32_t stage = 0;
-  int32_t *bf0, *bf1;
-  int32_t step[4];
-  // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
-  // stage 1;
-  stage++;
-  assert(output != input);
-  bf1 = output;
-  bf1[0] = input[0];
-  bf1[1] = -input[3];
-  bf1[2] = -input[1];
-  bf1[3] = input[2];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-  // stage 2
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  (void)cos_bit;
+  int bd = stage_range[0];
+  int64_t s0, s1, s2, s3, s4, s5, s6, s7;
-  // stage 3
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0] + bf0[2];
-  bf1[1] = bf0[1] + bf0[3];
-  bf1[2] = bf0[0] - bf0[2];
-  bf1[3] = bf0[1] - bf0[3];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  int32_t x0 = input[0];
+  int32_t x1 = input[1];
+  int32_t x2 = input[2];
+  int32_t x3 = input[3];
-  // stage 4
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(cospi[56], bf0[0], -cospi[8], bf0[1], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(cospi[24], bf0[2], -cospi[40], bf0[3], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
-  // stage 5
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[1];
-  bf1[1] = bf0[2];
-  bf1[2] = bf0[3];
-  bf1[3] = bf0[0];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
+  s0 = s0 + s3 + s5;
+  s1 = s1 - s4 - s6;
+  s3 = s2;
+  s2 = sinpi_3_9 * s7;
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
+  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
+  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
 }
 void av1_iadst8_new(const int32_t *input, int32_t *output,
......
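
For reference while reading the hunk above, the helpers the new scalar code relies on come from libvpx. A sketch of their definitions, reproduced here under the assumption that they match vpx_dsp/txfm_common.h and vpx_dsp/inv_txfm.h:

#include <stdint.h>

typedef int64_t tran_high_t;

#define DCT_CONST_BITS 14
/* Round to nearest, then drop the n fractional bits. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static inline tran_high_t dct_const_round_shift(tran_high_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

/* Wrap an intermediate value into the signed range implied by bit depth
 * bd; relies on arithmetic right shift of signed values, as the libvpx
 * original does. */
#define HIGHBD_WRAPLOW(x, bd) ((((tran_high_t)(x)) << (24 - bd)) >> (24 - bd))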
@@ -72,77 +72,67 @@ static void idct4x4_sse4_1(__m128i *in, int bit) {
 }
 static void iadst4x4_sse4_1(__m128i *in, int bit) {
-  const int32_t *cospi = cospi_arr(bit);
-  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
-  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
-  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
-  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
-  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
-  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
-  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  bit = 14;
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const __m128i zero = _mm_setzero_si128();
+  const __m128i sinpi1 = _mm_set1_epi32(sinpi_1_9);
+  const __m128i sinpi2 = _mm_set1_epi32(sinpi_2_9);
+  const __m128i sinpi3 = _mm_set1_epi32(sinpi_3_9);
+  const __m128i sinpi4 = _mm_set1_epi32(sinpi_4_9);
+  __m128i t;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i x0, x1, x2, x3;
   __m128i u0, u1, u2, u3;
-  __m128i v0, v1, v2, v3, x, y;
+  __m128i v0, v1, v2, v3;
   v0 = _mm_unpacklo_epi32(in[0], in[1]);
   v1 = _mm_unpackhi_epi32(in[0], in[1]);
   v2 = _mm_unpacklo_epi32(in[2], in[3]);
   v3 = _mm_unpackhi_epi32(in[2], in[3]);
-  u0 = _mm_unpacklo_epi64(v0, v2);
-  u1 = _mm_unpackhi_epi64(v0, v2);
-  u2 = _mm_unpacklo_epi64(v1, v3);
-  u3 = _mm_unpackhi_epi64(v1, v3);
-  // stage 0
-  // stage 1
-  u1 = _mm_sub_epi32(zero, u1);
-  u3 = _mm_sub_epi32(zero, u3);
-  // stage 2
-  v0 = u0;
-  v1 = u3;
-  x = _mm_mullo_epi32(u1, cospi32);
-  y = _mm_mullo_epi32(u2, cospi32);
-  v2 = _mm_add_epi32(x, y);
-  v2 = _mm_add_epi32(v2, rnding);
-  v2 = _mm_srai_epi32(v2, bit);
-  v3 = _mm_sub_epi32(x, y);
-  v3 = _mm_add_epi32(v3, rnding);
-  v3 = _mm_srai_epi32(v3, bit);
-  // stage 3
-  u0 = _mm_add_epi32(v0, v2);
-  u1 = _mm_add_epi32(v1, v3);
-  u2 = _mm_sub_epi32(v0, v2);
-  u3 = _mm_sub_epi32(v1, v3);
-  // stage 4
-  x = _mm_mullo_epi32(u0, cospi8);
-  y = _mm_mullo_epi32(u1, cospi56);
-  in[3] = _mm_add_epi32(x, y);
-  in[3] = _mm_add_epi32(in[3], rnding);
-  in[3] = _mm_srai_epi32(in[3], bit);
-  x = _mm_mullo_epi32(u0, cospi56);
-  y = _mm_mullo_epi32(u1, cospim8);
-  in[0] = _mm_add_epi32(x, y);
-  in[0] = _mm_add_epi32(in[0], rnding);
-  in[0] = _mm_srai_epi32(in[0], bit);
-  x = _mm_mullo_epi32(u2, cospi40);
-  y = _mm_mullo_epi32(u3, cospi24);
-  in[1] = _mm_add_epi32(x, y);
-  in[1] = _mm_add_epi32(in[1], rnding);
-  in[1] = _mm_srai_epi32(in[1], bit);
-  x = _mm_mullo_epi32(u2, cospi24);
-  y = _mm_mullo_epi32(u3, cospim40);
-  in[2] = _mm_add_epi32(x, y);
-  in[2] = _mm_add_epi32(in[2], rnding);
-  in[2] = _mm_srai_epi32(in[2], bit);
+  x0 = _mm_unpacklo_epi64(v0, v2);
+  x1 = _mm_unpackhi_epi64(v0, v2);
+  x2 = _mm_unpacklo_epi64(v1, v3);
+  x3 = _mm_unpackhi_epi64(v1, v3);
+  s0 = _mm_mullo_epi32(x0, sinpi1);
+  s1 = _mm_mullo_epi32(x0, sinpi2);
+  s2 = _mm_mullo_epi32(x1, sinpi3);
+  s3 = _mm_mullo_epi32(x2, sinpi4);
+  s4 = _mm_mullo_epi32(x2, sinpi1);
+  s5 = _mm_mullo_epi32(x3, sinpi2);
+  s6 = _mm_mullo_epi32(x3, sinpi4);
+  t = _mm_sub_epi32(x0, x2);
+  s7 = _mm_add_epi32(t, x3);
+  t = _mm_add_epi32(s0, s3);
+  s0 = _mm_add_epi32(t, s5);
+  t = _mm_sub_epi32(s1, s4);
+  s1 = _mm_sub_epi32(t, s6);
+  s3 = s2;
+  s2 = _mm_mullo_epi32(s7, sinpi3);
+  u0 = _mm_add_epi32(s0, s3);
+  u1 = _mm_add_epi32(s1, s3);
+  u2 = s2;
+  t = _mm_add_epi32(s0, s1);
+  u3 = _mm_sub_epi32(t, s3);
+  u0 = _mm_add_epi32(u0, rnding);
+  u0 = _mm_srai_epi32(u0, bit);
+  u1 = _mm_add_epi32(u1, rnding);
+  u1 = _mm_srai_epi32(u1, bit);
+  u2 = _mm_add_epi32(u2, rnding);
+  u2 = _mm_srai_epi32(u2, bit);
+  u3 = _mm_add_epi32(u3, rnding);
+  u3 = _mm_srai_epi32(u3, bit);
+  in[0] = u0;
+  in[1] = u1;
+  in[2] = u2;
+  in[3] = u3;
 }
 static INLINE void round_shift_4x4(__m128i *in, int shift) {
......
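
In both SSE4.1 kernels the `bit = 14;` reassignment makes the vector rounding match dct_const_round_shift: the sinpi constants are Q14, so every product needs a round-and-shift by 14 regardless of the cos_bit the caller passed. A minimal sketch of the add-rnding-then-srai pattern that repeats throughout the hunk (the helper name is hypothetical):

#include <smmintrin.h>  /* SSE4.1, needed for _mm_mullo_epi32 in the kernels */

/* Vector form of ROUND_POWER_OF_TWO(v, 14): add the rounding constant,
 * then arithmetic-shift out the 14 fractional bits per 32-bit lane. */
static __m128i round_shift14_epi32(__m128i v) {
  const __m128i rnding = _mm_set1_epi32(1 << 13);
  return _mm_srai_epi32(_mm_add_epi32(v, rnding), 14);
}

Unlike the scalar path, which widens to int64_t before multiplying, _mm_mullo_epi32 keeps only the low 32 bits of each product, so these kernels rely on the 4x4 coefficient range keeping every intermediate within 32 bits.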
@@ -692,66 +692,45 @@ void av1_fdct32_new(const int32_t *input, int32_t *output,
 void av1_fadst4_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range) {
-  const int32_t size = 4;
-  const int32_t *cospi;
-  int32_t stage = 0;
-  int32_t *bf0, *bf1;
-  int32_t step[4];
-  // stage 0;
-  range_check(stage, input, input, size, stage_range[stage]);
-  // stage 1;
-  stage++;
-  bf1 = output;
-  bf1[0] = input[3];
-  bf1[1] = input[0];
-  bf1[2] = input[1];
-  bf1[3] = input[2];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-  // stage 2
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
-  bf1[1] = half_btf(-cospi[8], bf0[1], cospi[56], bf0[0], cos_bit[stage]);
-  bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(-cospi[40], bf0[3], cospi[24], bf0[2], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
-  // stage 3
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0] + bf0[2];
-  bf1[1] = bf0[1] + bf0[3];
-  bf1[2] = -bf0[2] + bf0[0];
-  bf1[3] = -bf0[3] + bf0[1];
-  range_check(stage, input, bf1, size, stage_range[stage]);
-  // stage 4
-  stage++;
-  cospi = cospi_arr(cos_bit[stage]);
-  bf0 = output;
-  bf1 = step;
-  bf1[0] = bf0[0];
-  bf1[1] = bf0[1];
-  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
-  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  (void)cos_bit;
+  (void)stage_range;
+  int64_t x0, x1, x2, x3;
+  int64_t s0, s1, s2, s3, s4, s5, s6, s7;
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
-  // stage 5
-  stage++;
-  bf0 = step;
-  bf1 = output;
-  bf1[0] = bf0[0];
-  bf1[1] = -bf0[2];
-  bf1[2] = bf0[3];
-  bf1[3] = -bf0[1];
-  range_check(stage, input, bf1, size, stage_range[stage]);
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_4_9 * x0;
+  s2 = sinpi_2_9 * x1;
+  s3 = sinpi_1_9 * x1;
+  s4 = sinpi_3_9 * x2;
+  s5 = sinpi_4_9 * x3;
+  s6 = sinpi_2_9 * x3;
+  s7 = x0 + x1 - x3;
+  x0 = s0 + s2 + s5;
+  x1 = sinpi_3_9 * s7;
+  x2 = s1 - s3 + s6;
+  x3 = s4;
+  s0 = x0 + x3;
+  s1 = x1;
+  s2 = x2 - x3;
+  s3 = x2 - x0 + x3;
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = (int32_t)fdct_round_shift(s0);
+  output[1] = (int32_t)fdct_round_shift(s1);
+  output[2] = (int32_t)fdct_round_shift(s2);
+  output[3] = (int32_t)fdct_round_shift(s3);
 }
 void av1_fadst8_new(const int32_t *input, int32_t *output,
......
@@ -121,72 +121,57 @@ static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
 }
 static void fadst4x4_sse4_1(__m128i *in, int bit) {
-  const int32_t *cospi = cospi_arr(bit);
-  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
-  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
-  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
-  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
-  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  bit = 14;
   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
-  const __m128i kZero = _mm_setzero_si128();
-  __m128i s0, s1, s2, s3;
+  const __m128i sinpi1 = _mm_set1_epi32(sinpi_1_9);
+  const __m128i sinpi2 = _mm_set1_epi32(sinpi_2_9);
+  const __m128i sinpi3 = _mm_set1_epi32(sinpi_3_9);
+  const __m128i sinpi4 = _mm_set1_epi32(sinpi_4_9);
+  __m128i t;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i x0, x1, x2, x3;
   __m128i u0, u1, u2, u3;
   __m128i v0, v1, v2, v3;
-  // stage 0
-  // stage 1
-  // stage 2
-  u0 = _mm_mullo_epi32(in[3], cospi8);
-  u1 = _mm_mullo_epi32(in[0], cospi56);
-  u2 = _mm_add_epi32(u0, u1);
-  s0 = _mm_add_epi32(u2, rnding);
-  s0 = _mm_srai_epi32(s0, bit);
-  v0 = _mm_mullo_epi32(in[3], cospi56);
-  v1 = _mm_mullo_epi32(in[0], cospi8);
-  v2 = _mm_sub_epi32(v0, v1);
-  s1 = _mm_add_epi32(v2, rnding);
-  s1 = _mm_srai_epi32(s1, bit);
-  u0 = _mm_mullo_epi32(in[1], cospi40);
-  u1 = _mm_mullo_epi32(in[2], cospi24);
-  u2 = _mm_add_epi32(u0, u1);
-  s2 = _mm_add_epi32(u2, rnding);
-  s2 = _mm_srai_epi32(s2, bit);
+  s0 = _mm_mullo_epi32(in[0], sinpi1);
+  s1 = _mm_mullo_epi32(in[0], sinpi4);
+  s2 = _mm_mullo_epi32(in[1], sinpi2);
+  s3 = _mm_mullo_epi32(in[1], sinpi1);
+  s4 = _mm_mullo_epi32(in[2], sinpi3);
+  s5 = _mm_mullo_epi32(in[3], sinpi4);
+  s6 = _mm_mullo_epi32(in[3], sinpi2);
+  t = _mm_add_epi32(in[0], in[1]);
+  s7 = _mm_sub_epi32(t, in[3]);
+  t = _mm_add_epi32(s0, s2);
+  x0 = _mm_add_epi32(t, s5);
+  x1 = _mm_mullo_epi32(s7, sinpi3);
+  t = _mm_sub_epi32(s1, s3);
+  x2 = _mm_add_epi32(t, s6);
+  x3 = s4;
+  s0 = _mm_add_epi32(x0, x3);
+  s1 = x1;
+  s2 = _mm_sub_epi32(x2, x3);
+  t = _mm_sub_epi32(x2, x0);
+  s3 = _mm_add_epi32(t, x3);
+  u0 = _mm_add_epi32(s0, rnding);
+  u0 = _mm_srai_epi32(u0, bit);
+  u1 = _mm_add_epi32(s1, rnding);
+  u1 = _mm_srai_epi32(u1, bit);
+  u2 = _mm_add_epi32(s2, rnding);
+  u2 = _mm_srai_epi32(u2, bit);
+  u3 = _mm_add_epi32(s3, rnding);
+  u3 = _mm_srai_epi32(u3, bit);
-  v0 = _mm_mullo_epi32(in[1], cospi24);
-  v1 = _mm_mullo_epi32(in[2], cospi40);
-  v2 = _mm_sub_epi32(v0, v1);
-  s3 = _mm_add_epi32(v2, rnding);
-  s3 = _mm_srai_epi32(s3, bit);
-  // stage 3
-  u0 = _mm_add_epi32(s0, s2);
-  u2 = _mm_sub_epi32(s0, s2);
-  u1 = _mm_add_epi32(s1, s3);
-  u3 = _mm_sub_epi32(s1, s3);
-  // stage 4
-  v0 = _mm_mullo_epi32(u2, cospi32);
-  v1 = _mm_mullo_epi32(u3, cospi32);
-  v2 = _mm_add_epi32(v0, v1);
-  s2 = _mm_add_epi32(v2, rnding);
-  u2 = _mm_srai_epi32(s2, bit);
-  v2 = _mm_sub_epi32(v0, v1);
-  s3 = _mm_add_epi32(v2, rnding);
-  u3 = _mm_srai_epi32(s3, bit);
-  // u0, u1, u2, u3
-  u2 = _mm_sub_epi32(kZero, u2);
-  u1 = _mm_sub_epi32(kZero, u1);
-  // u0, u2, u3, u1
   // Transpose 4x4 32-bit
-  v0 = _mm_unpacklo_epi32(u0, u2);
-  v1 = _mm_unpackhi_epi32(u0, u2);
-  v2 = _mm_unpacklo_epi32(u3, u1);
-  v3 = _mm_unpackhi_epi32(u3, u1);
+  v0 = _mm_unpacklo_epi32(u0, u1);
+  v1 = _mm_unpackhi_epi32(u0, u1);
+  v2 = _mm_unpacklo_epi32(u2, u3);
+  v3 = _mm_unpackhi_epi32(u2, u3);
   in[0] = _mm_unpacklo_epi64(v0, v2);
   in[1] = _mm_unpackhi_epi64(v0, v2);
......
@@ -86,9 +86,10 @@ TEST(av1_fwd_txfm1d, accuracy) {
       reference_hybrid_1d(ref_input, ref_output, txfm_size, txfm_type);
       for (int ni = 0; ni < txfm_size; ++ni) {
-        EXPECT_LE(
+        ASSERT_LE(
            abs(output[ni] - static_cast<int32_t>(round(ref_output[ni]))),
-            max_error);
+            max_error)
+            << "tx size = " << txfm_size << ", tx type = " << txfm_type;
      }
    }
  }
......
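
A note on the hunk above: in GoogleTest, ASSERT_LE generates a fatal failure that aborts the current test on the first out-of-threshold coefficient, where EXPECT_LE would keep going and print one failure per coefficient; the streamed message pins any failure to a specific transform size and type.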
@@ -139,20 +139,20 @@ vector<AV1FwdTxfm2dParam> GetTxfm2dParamList() {
   vector<AV1FwdTxfm2dParam> param_list;
   for (int t = 0; t <= FLIPADST_ADST; ++t) {
     const TX_TYPE tx_type = static_cast<TX_TYPE>(t);
-    param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_4X4, 2, 0.2));
+    param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_4X4, 2, 0.5));
     param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_8X8, 5, 0.6));
     param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_16X16, 11, 1.5));
     param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_32X32, 70, 7));
-    param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_4X8, 2.5, 0.4));
-    param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_8X4, 2.5, 0.4));
+    param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_4X8, 2.9, 0.55));
+    param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_8X4, 3.2, 0.56));
     param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_8X16, 6.5, 1));
     param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_16X8, 6.5, 1));
     param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_16X32, 46, 7));
     param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_32X16, 30, 7));
-    param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_4X16, 5, 0.6));
-    param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_16X4, 5, 0.6));
+    param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_4X16, 5, 0.7));
+    param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_16X4, 5.5, 0.9));
     param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_8X32, 14, 2.1));
     param_list.push_back(AV1FwdTxfm2dParam(tx_type, TX_32X8, 11, 1.6));
......
@@ -90,7 +90,62 @@ void reference_idct_1d(const double *in, double *out, int size) {
    }
 }
+// TODO(any): Copied from dct.c. Should be replaced by a proper reference
+// function that takes 'double' input & output.
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t x0, x1, x2, x3;
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+  x0 = input[0];
+  x1 = input[1];
+  x2 = input[2];
+  x3 = input[3];
+  if (!(x0 | x1 | x2 | x3)) {
+    output[0] = output[1] = output[2] = output[3] = 0;
+    return;
+  }
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_4_9 * x0;
+  s2 = sinpi_2_9 * x1;
+  s3 = sinpi_1_9 * x1;
+  s4 = sinpi_3_9 * x2;
+  s5 = sinpi_4_9 * x3;
+  s6 = sinpi_2_9 * x3;
+  s7 = x0 + x1 - x3;
+  x0 = s0 + s2 + s5;
+  x1 = sinpi_3_9 * s7;
+  x2 = s1 - s3 + s6;
+  x3 = s4;
+  s0 = x0 + x3;
+  s1 = x1;
+  s2 = x2 - x3;
+  s3 = x2 - x0 + x3;
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = (tran_low_t)fdct_round_shift(s0);
+  output[1] = (tran_low_t)fdct_round_shift(s1);
+  output[2] = (tran_low_t)fdct_round_shift(s2);
+  output[3] = (tran_low_t)fdct_round_shift(s3);
+}
 void reference_adst_1d(const double *in, double *out, int size) {
+  if (size == 4) {  // Special case.
+    tran_low_t int_input[4];
+    for (int i = 0; i < 4; ++i) {
+      int_input[i] = static_cast<tran_low_t>(round(in[i]));
+    }
+    tran_low_t int_output[4];
+    fadst4(int_input, int_output);
+    for (int i = 0; i < 4; ++i) {
+      out[i] = int_output[i];
+    }
+    return;
+  }
   for (int k = 0; k < size; ++k) {
     out[k] = 0;
     for (int n = 0; n < size; ++n) {
......
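
The size-4 special case above exists because a double-precision ADST reference cannot reproduce the fixed-point rounding of the sinpi-based fadst4; comparing 4-point output against the same integer routine keeps that comparison exact, while the loosened thresholds in the 2-D test absorb the remaining drift in the composed transforms.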