Commit 3e8cceb3 authored by Debargha Mukherjee

Speed up of DST and the search in ext_tx

Adds an early termination to the ext_tx search, and also
implements the DST transforms more efficiently.

About 4 times faster with the ext-tx experiment.

There is a 0.09% drop in performance on derflr (from 1.735% to
1.648%), but the speedup achieved makes it worthwhile.

Change-Id: I2ede9d69c557f25e0a76cd5d701cc0e36e825c7c
parent 7c514e2d
......@@ -242,13 +242,13 @@ static TX_TYPE ext_tx_to_txtype[EXT_TX_TYPES] = {
FLIPADST_FLIPADST,
ADST_FLIPADST,
FLIPADST_ADST,
DST_DST,
DST_DCT,
DCT_DST,
DST_ADST,
ADST_DST,
DST_FLIPADST,
FLIPADST_DST,
DST_DST,
};
#endif // CONFIG_EXT_TX
......
......@@ -317,17 +317,17 @@ static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
#if CONFIG_EXT_TX
const vpx_tree_index vp10_ext_tx_tree[TREE_SIZE(EXT_TX_TYPES)] = {
-NORM, 2,
-ALT9, 4,
-ALT15, 4,
6, 16,
8, 10,
-ALT10, -ALT11,
-ALT9, -ALT10,
12, 14,
-ALT1, -ALT2,
-ALT4, -ALT5,
18, 24,
20, 22,
-ALT12, -ALT13,
-ALT14, -ALT15,
-ALT11, -ALT12,
-ALT13, -ALT14,
26, 28,
-ALT3, -ALT6,
-ALT7, -ALT8
......
......@@ -100,13 +100,13 @@ typedef enum {
FLIPADST_FLIPADST = 6,
ADST_FLIPADST = 7,
FLIPADST_ADST = 8,
DST_DST = 9,
DST_DCT = 10,
DCT_DST = 11,
DST_ADST = 12,
ADST_DST = 13,
DST_FLIPADST = 14,
FLIPADST_DST = 15,
DST_DCT = 9,
DCT_DST = 10,
DST_ADST = 11,
ADST_DST = 12,
DST_FLIPADST = 13,
FLIPADST_DST = 14,
DST_DST = 15,
#endif // CONFIG_EXT_TX
TX_TYPES,
} TX_TYPE;
......
This diff is collapsed.
......@@ -36,79 +36,166 @@ static INLINE void range_check(const tran_low_t *input, const int size,
#if CONFIG_EXT_TX
// 4-point forward DST written out as straight-line code.
//
// input:  4 coefficients to transform.
// output: 4 transformed coefficients.
//
// Input samples are first folded into mirrored sums (s..) and differences
// (d..); even-indexed outputs are built from the sums and odd-indexed outputs
// from the differences. Every input is read into a local before any output is
// written.
void fdst4(const tran_low_t *input, tran_low_t *output) {
  // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
  static const int32_t sinvalue_lookup[] = {
    141124871, 228344838,
  };
  int64_t sum;
  // Widen to 64 bits before adding so the fold cannot overflow a narrower
  // tran_low_t.
  int64_t s03 = (int64_t)input[0] + input[3];
  int64_t d03 = (int64_t)input[0] - input[3];
  int64_t s12 = (int64_t)input[1] + input[2];
  int64_t d12 = (int64_t)input[1] - input[2];

  // Each accumulated product is rounded and shifted down by
  // 2 * DCT_CONST_BITS to undo the fixed-point scale of the lookup table.
  sum = s03 * sinvalue_lookup[0] + s12 * sinvalue_lookup[1];
  output[0] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d03 * sinvalue_lookup[1] + d12 * sinvalue_lookup[0];
  output[1] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = s03 * sinvalue_lookup[1] - s12 * sinvalue_lookup[0];
  output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1];
  output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
}
// 8-point forward DST written out as straight-line code.
//
// input:  8 coefficients to transform.
// output: 8 transformed coefficients.
//
// Input samples are first folded into mirrored sums (s..) and differences
// (d..); even-indexed outputs are built from the sums and odd-indexed outputs
// from the differences. Every input is read into a local before any output is
// written.
void fdst8(const tran_low_t *input, tran_low_t *output) {
  // {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2
  static const int sinvalue_lookup[] = {
    86559612, 162678858, 219176632, 249238470
  };
  int64_t sum;
  // Widen to 64 bits before adding so the fold cannot overflow a narrower
  // tran_low_t.
  int64_t s07 = (int64_t)input[0] + input[7];
  int64_t d07 = (int64_t)input[0] - input[7];
  int64_t s16 = (int64_t)input[1] + input[6];
  int64_t d16 = (int64_t)input[1] - input[6];
  int64_t s25 = (int64_t)input[2] + input[5];
  int64_t d25 = (int64_t)input[2] - input[5];
  int64_t s34 = (int64_t)input[3] + input[4];
  int64_t d34 = (int64_t)input[3] - input[4];

  // Each accumulated product is rounded and shifted down by
  // 2 * DCT_CONST_BITS to undo the fixed-point scale of the lookup table.
  sum = s07 * sinvalue_lookup[0] + s16 * sinvalue_lookup[1] +
        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[3];
  output[0] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d07 * sinvalue_lookup[1] + d16 * sinvalue_lookup[3] +
        d25 * sinvalue_lookup[2] + d34 * sinvalue_lookup[0];
  output[1] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  // Outputs 2 and 5 need only one multiplier: the s25/d25 term drops out and
  // the remaining terms all share sinvalue_lookup[2].
  sum = (s07 + s16 - s34) * sinvalue_lookup[2];
  output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d07 * sinvalue_lookup[3] + d16 * sinvalue_lookup[0] -
        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[1];
  output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = s07 * sinvalue_lookup[3] - s16 * sinvalue_lookup[0] -
        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[1];
  output[4] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = (d07 - d16 + d34) * sinvalue_lookup[2];
  output[5] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = s07 * sinvalue_lookup[1] - s16 * sinvalue_lookup[3] +
        s25 * sinvalue_lookup[2] - s34 * sinvalue_lookup[0];
  output[6] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] +
        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3];
  output[7] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
}
// 16-point forward DST written out as straight-line code.
//
// input:  16 coefficients to transform.
// output: 16 transformed coefficients.
//
// Input samples are first folded into mirrored sums (s..) and differences
// (d..); even-indexed outputs are built from the sums and odd-indexed outputs
// from the differences. Every input is read into a local before any output is
// written.
void fdst16(const tran_low_t *input, tran_low_t *output) {
  // {sin(pi/17), sin(pi*2/17), ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2)
  static const int sinvalue_lookup[] = {
    47852167, 94074787, 137093803, 175444254,
    207820161, 233119001, 250479254, 259309736
  };
  int64_t sum;
  // Widen to 64 bits before adding so the fold cannot overflow a narrower
  // tran_low_t.
  int64_t s015 = (int64_t)input[0] + input[15];
  int64_t d015 = (int64_t)input[0] - input[15];
  int64_t s114 = (int64_t)input[1] + input[14];
  int64_t d114 = (int64_t)input[1] - input[14];
  int64_t s213 = (int64_t)input[2] + input[13];
  int64_t d213 = (int64_t)input[2] - input[13];
  int64_t s312 = (int64_t)input[3] + input[12];
  int64_t d312 = (int64_t)input[3] - input[12];
  int64_t s411 = (int64_t)input[4] + input[11];
  int64_t d411 = (int64_t)input[4] - input[11];
  int64_t s510 = (int64_t)input[5] + input[10];
  int64_t d510 = (int64_t)input[5] - input[10];
  int64_t s69 = (int64_t)input[6] + input[9];
  int64_t d69 = (int64_t)input[6] - input[9];
  int64_t s78 = (int64_t)input[7] + input[8];
  int64_t d78 = (int64_t)input[7] - input[8];

  // Each accumulated product is rounded and shifted down by
  // 2 * DCT_CONST_BITS to undo the fixed-point scale of the lookup table.
  sum = s015 * sinvalue_lookup[0] + s114 * sinvalue_lookup[1] +
        s213 * sinvalue_lookup[2] + s312 * sinvalue_lookup[3] +
        s411 * sinvalue_lookup[4] + s510 * sinvalue_lookup[5] +
        s69 * sinvalue_lookup[6] + s78 * sinvalue_lookup[7];
  output[0] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d015 * sinvalue_lookup[1] + d114 * sinvalue_lookup[3] +
        d213 * sinvalue_lookup[5] + d312 * sinvalue_lookup[7] +
        d411 * sinvalue_lookup[6] + d510 * sinvalue_lookup[4] +
        d69 * sinvalue_lookup[2] + d78 * sinvalue_lookup[0];
  output[1] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = s015 * sinvalue_lookup[2] + s114 * sinvalue_lookup[5] +
        s213 * sinvalue_lookup[7] + s312 * sinvalue_lookup[4] +
        s411 * sinvalue_lookup[1] - s510 * sinvalue_lookup[0] -
        s69 * sinvalue_lookup[3] - s78 * sinvalue_lookup[6];
  output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d015 * sinvalue_lookup[3] + d114 * sinvalue_lookup[7] +
        d213 * sinvalue_lookup[4] + d312 * sinvalue_lookup[0] -
        d411 * sinvalue_lookup[2] - d510 * sinvalue_lookup[6] -
        d69 * sinvalue_lookup[5] - d78 * sinvalue_lookup[1];
  output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = s015 * sinvalue_lookup[4] + s114 * sinvalue_lookup[6] +
        s213 * sinvalue_lookup[1] - s312 * sinvalue_lookup[2] -
        s411 * sinvalue_lookup[7] - s510 * sinvalue_lookup[3] +
        s69 * sinvalue_lookup[0] + s78 * sinvalue_lookup[5];
  output[4] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d015 * sinvalue_lookup[5] + d114 * sinvalue_lookup[4] -
        d213 * sinvalue_lookup[0] - d312 * sinvalue_lookup[6] -
        d411 * sinvalue_lookup[3] + d510 * sinvalue_lookup[1] +
        d69 * sinvalue_lookup[7] + d78 * sinvalue_lookup[2];
  output[5] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = s015 * sinvalue_lookup[6] + s114 * sinvalue_lookup[2] -
        s213 * sinvalue_lookup[3] - s312 * sinvalue_lookup[5] +
        s411 * sinvalue_lookup[0] + s510 * sinvalue_lookup[7] +
        s69 * sinvalue_lookup[1] - s78 * sinvalue_lookup[4];
  output[6] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d015 * sinvalue_lookup[7] + d114 * sinvalue_lookup[0] -
        d213 * sinvalue_lookup[6] - d312 * sinvalue_lookup[1] +
        d411 * sinvalue_lookup[5] + d510 * sinvalue_lookup[2] -
        d69 * sinvalue_lookup[4] - d78 * sinvalue_lookup[3];
  output[7] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = s015 * sinvalue_lookup[7] - s114 * sinvalue_lookup[0] -
        s213 * sinvalue_lookup[6] + s312 * sinvalue_lookup[1] +
        s411 * sinvalue_lookup[5] - s510 * sinvalue_lookup[2] -
        s69 * sinvalue_lookup[4] + s78 * sinvalue_lookup[3];
  output[8] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d015 * sinvalue_lookup[6] - d114 * sinvalue_lookup[2] -
        d213 * sinvalue_lookup[3] + d312 * sinvalue_lookup[5] +
        d411 * sinvalue_lookup[0] - d510 * sinvalue_lookup[7] +
        d69 * sinvalue_lookup[1] + d78 * sinvalue_lookup[4];
  output[9] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = s015 * sinvalue_lookup[5] - s114 * sinvalue_lookup[4] -
        s213 * sinvalue_lookup[0] + s312 * sinvalue_lookup[6] -
        s411 * sinvalue_lookup[3] - s510 * sinvalue_lookup[1] +
        s69 * sinvalue_lookup[7] - s78 * sinvalue_lookup[2];
  output[10] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d015 * sinvalue_lookup[4] - d114 * sinvalue_lookup[6] +
        d213 * sinvalue_lookup[1] + d312 * sinvalue_lookup[2] -
        d411 * sinvalue_lookup[7] + d510 * sinvalue_lookup[3] +
        d69 * sinvalue_lookup[0] - d78 * sinvalue_lookup[5];
  output[11] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = s015 * sinvalue_lookup[3] - s114 * sinvalue_lookup[7] +
        s213 * sinvalue_lookup[4] - s312 * sinvalue_lookup[0] -
        s411 * sinvalue_lookup[2] + s510 * sinvalue_lookup[6] -
        s69 * sinvalue_lookup[5] + s78 * sinvalue_lookup[1];
  output[12] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d015 * sinvalue_lookup[2] - d114 * sinvalue_lookup[5] +
        d213 * sinvalue_lookup[7] - d312 * sinvalue_lookup[4] +
        d411 * sinvalue_lookup[1] + d510 * sinvalue_lookup[0] -
        d69 * sinvalue_lookup[3] + d78 * sinvalue_lookup[6];
  output[13] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = s015 * sinvalue_lookup[1] - s114 * sinvalue_lookup[3] +
        s213 * sinvalue_lookup[5] - s312 * sinvalue_lookup[7] +
        s411 * sinvalue_lookup[6] - s510 * sinvalue_lookup[4] +
        s69 * sinvalue_lookup[2] - s78 * sinvalue_lookup[0];
  output[14] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
  sum = d015 * sinvalue_lookup[0] - d114 * sinvalue_lookup[1] +
        d213 * sinvalue_lookup[2] - d312 * sinvalue_lookup[3] +
        d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] +
        d69 * sinvalue_lookup[6] - d78 * sinvalue_lookup[7];
  output[15] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
}
#endif // CONFIG_EXT_TX
......
......@@ -640,7 +640,6 @@ static void choose_tx_size_from_rd(VP10_COMP *cpi, MACROBLOCK *x,
int start_tx_type, end_tx_type;
#endif // CONFIG_EXT_TX
const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
assert(skip_prob > 0);
s0 = vp10_cost_bit(skip_prob, 0);
......@@ -675,8 +674,14 @@ static void choose_tx_size_from_rd(VP10_COMP *cpi, MACROBLOCK *x,
int r_tx_size = 0;
#if CONFIG_EXT_TX
if (mbmi->ext_txfrm >= GET_EXT_TX_TYPES(n))
continue;
if (is_inter_block(mbmi)) {
if (mbmi->ext_txfrm >= GET_EXT_TX_TYPES(n)) {
continue;
} else if (mbmi->ext_txfrm >= ALT11 && best_tx_type == NORM) {
// Terminate if the best so far is still NORM
break;
}
}
#endif // CONFIG_EXT_TX
for (m = 0; m <= n - (n == (int) max_tx_size); ++m) {
......@@ -725,8 +730,8 @@ static void choose_tx_size_from_rd(VP10_COMP *cpi, MACROBLOCK *x,
last_rd = rd;
#if CONFIG_EXT_TX
if (rd < (is_inter_block(mbmi) &&
(best_tx_type == NORM) ? ext_tx_th : 1) * best_rd) {
if (rd < (is_inter_block(mbmi) && best_tx_type == NORM ? ext_tx_th : 1) *
best_rd) {
#else
if (rd < best_rd) {
#endif // CONFIG_EXT_TX
......@@ -747,7 +752,7 @@ static void choose_tx_size_from_rd(VP10_COMP *cpi, MACROBLOCK *x,
mbmi->tx_size = best_tx;
#if CONFIG_EXT_TX
mbmi->ext_txfrm = best_tx_type;
mbmi->ext_txfrm = best_tx_type > -1 ? best_tx_type : NORM;
txfm_rd_in_plane(x, &r, &d, &s,
&sse, ref_best_rd, 0, bs, best_tx,
cpi->sf.use_fast_coef_costing);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment