Commit 8089315a authored by Urvang Joshi

TX64X64: Reuse scan, quant etc from 32x32.

- Reuse the TX_32X32 scan order
- Truncate the max eob to 32*32 = 1024 coefficients (down from 64*64 = 4096 for TX_64X64)
- Quantize and entropy-code exactly as done for TX_32X32
- Reuse the TX_32X32 quantization matrices

Compression performance is roughly neutral:
https://arewecompressedyet.com/?job=tx64x64_oldscans%402017-11-06T03%3A11%3A53.868Z&job=tx64x64_reusescans%402017-11-06T03%3A12%3A55.738Z

Change-Id: Ie9182c1c69a42a3c1ab4fc980abbd6000c64f179
parent 9bf85997
@@ -351,6 +351,10 @@ void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
}
// Zero out the bottom 64x32 area.
memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int row = 1; row < 32; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+ }
}
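The repack loop above copies in place safely: row 0 is already where it belongs, and for every row >= 1 the source range [64 * row, 64 * row + 32) begins at or after the end of the destination range [32 * row, 32 * row + 32), so memcpy never sees overlapping regions. The same pattern recurs in av1_fwd_txfm2d_64x32_c, av1_fht64x64_c and av1_fht64x32_c below; a hypothetical shared helper (a sketch, not part of this commit; assumes <string.h> and <stdint.h>, with int32_t standing in for the tran_low_t used by the av1_fht variants) could factor it out:

  // Repack the top-left 32x32 quadrant of a 64-wide coefficient block into
  // contiguous raster order with stride 32. In-place safe for row >= 1.
  static void repack_topleft_32x32(int32_t *output) {
    for (int row = 1; row < 32; ++row) {
      memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
    }
  }

The 32x64 cases need no repack: their output is only 32 wide, so the surviving top 32 rows already sit contiguously at stride 32.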
void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
@@ -361,6 +365,7 @@ void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
// Zero out the bottom 32x32 area.
memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
+ // Note: no repacking needed here.
}
void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
@@ -373,6 +378,10 @@ void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
for (int row = 0; row < 32; ++row) {
memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
}
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int row = 1; row < 32; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+ }
}
#endif // CONFIG_TX64X64
......
@@ -391,12 +391,31 @@ void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output,
#if CONFIG_TX64X64
void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
+ // TODO(urvang): Can the same array be reused, instead of using a new array?
+ // Remap 32x32 input into a modified 64x64 by:
+ // - Copying over these values in top-left 32x32 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[64 * 64];
+ for (int row = 0; row < 32; ++row) {
+ memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+ memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+ }
+ memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input));
int txfm_buf[64 * 64 + 64 + 64];
- inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_64X64, bd);
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64,
+ bd);
}
void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
+ // Remap 32x32 input into a modified 64x32 by:
+ // - Copying over these values in top-left 32x32 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[64 * 32];
+ for (int row = 0; row < 32; ++row) {
+ memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+ memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+ }
#if CONFIG_TXMG
int txfm_buf[64 * 32 + 64 + 64];
int32_t rinput[64 * 32];
@@ -408,20 +427,28 @@ void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output,
int h = tx_size_high[tx_size];
int rw = h;
int rh = w;
- transpose_int32(rinput, rw, input, w, w, h);
+ transpose_int32(rinput, rw, mod_input, w, w, h);
transpose_uint16(routput, rw, output, stride, w, h);
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
#else
int txfm_buf[64 * 32 + 64 + 64];
- inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_64X32, bd);
- #endif
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32,
+ bd);
+ #endif  // CONFIG_TXMG
}
void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
+ // Remap 32x32 input into a modified 32x64 input by:
+ // - Copying over these values in top-left 32x32 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[32 * 64];
+ memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input));
+ memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input));
int txfm_buf[64 * 32 + 64 + 64];
- inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X64, bd);
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64,
+ bd);
}
#endif // CONFIG_TX64X64
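On the TODO in av1_inv_txfm2d_add_64x64_c above: the mod_input copy could in principle be avoided if the caller's coefficient buffer were writable and sized for the full 64x64 block. Expanding the packed rows from the bottom up never clobbers unread data, because for every row >= 1 the packed source [32 * row, 32 * row + 32) ends at or before the expanded destination [64 * row, 64 * row + 64) begins. A sketch of that in-place variant (it assumes buffer ownership this commit deliberately does not assume, since input is const in the decode path):

  // Expand a packed 32x32 coefficient block (stride 32) into the top-left
  // quadrant of a 64x64 block (stride 64), zeroing the other three quadrants.
  static void expand_32x32_to_64x64_inplace(int32_t *buf /* 64 * 64 */) {
    memset(buf + 32 * 64, 0, 32 * 64 * sizeof(*buf));  // bottom 32 rows
    for (int row = 31; row >= 1; --row) {
      memmove(buf + row * 64, buf + row * 32, 32 * sizeof(*buf));
    }
    for (int row = 0; row < 32; ++row) {
      memset(buf + row * 64 + 32, 0, 32 * sizeof(*buf));  // right half of row
    }
  }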
......
@@ -1409,6 +1409,16 @@ static INLINE void transpose_int32(int32_t *dst, int dst_stride,
for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
}
+ static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
+ return
+ #if CONFIG_TX64X64 && !CONFIG_DAALA_TX
+ tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64
+ ? 1024
+ :
+ #endif  // CONFIG_TX64X64 && !CONFIG_DAALA_TX
+ tx_size_2d[tx_size];
+ }
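av1_get_max_eob() is the single point that caps the coefficient count, and it replaces the direct tx_size_2d[] lookups at every call site below. An equivalent way to write the same cap (a sketch for clarity, not the committed code; AOMMIN is aom's min macro from aom_dsp/aom_dsp_common.h, and this form assumes no transform size other than the 64-point ones exceeds 1024 coefficients):

  static INLINE int av1_get_max_eob_alt(TX_SIZE tx_size) {
  #if CONFIG_TX64X64 && !CONFIG_DAALA_TX
    return AOMMIN(tx_size_2d[tx_size], 32 * 32);  // 64-point sizes cap at 1024
  #else
    return tx_size_2d[tx_size];
  #endif
  }

The !CONFIG_DAALA_TX guard matters: the Daala transform path does not truncate its 64-point output, so its eob must stay at tx_size_2d[tx_size].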
#ifdef __cplusplus
} // extern "C"
#endif
......
@@ -2502,6 +2502,8 @@ void av1_inverse_transform_block(const MACROBLOCKD *xd,
uint8_t *dst, int stride, int eob) {
if (!eob) return;
+ assert(eob <= av1_get_max_eob(tx_size));
TxfmParam txfm_param;
init_txfm_param(xd, plane, tx_size, tx_type, eob, &txfm_param);
#if CONFIG_MRC_TX
......
@@ -487,10 +487,12 @@ void aom_qm_init(AV1_COMMON *cm) {
current = 0;
for (t = 0; t < TX_SIZES_ALL; ++t) {
const int size = tx_size_2d[t];
- // Don't use QM for sizes > 32x32
- if (q == NUM_QM_LEVELS - 1 || size > 1024) {
+ if (q == NUM_QM_LEVELS - 1) {
cm->gqmatrix[q][c][t] = NULL;
cm->giqmatrix[q][c][t] = NULL;
+ } else if (size > 1024) {  // Reuse matrices for TX_32X32
+ cm->gqmatrix[q][c][t] = cm->gqmatrix[q][c][TX_32X32];
+ cm->giqmatrix[q][c][t] = cm->giqmatrix[q][c][TX_32X32];
} else {
assert(current + size <= QM_TOTAL_SIZE);
cm->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current];
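Aliasing the TX_64X64, TX_64X32 and TX_32X64 entries to the TX_32X32 tables is safe because the quantizer indexes a matrix by raster position taken from the scan, and with the TX_32X32 scan reused, every such position lies inside the repacked 32x32 block. A check capturing the invariant (a sketch; qm_val_t and the scan/eob plumbing are aom's, the function itself is hypothetical):

  static void check_qm_reuse(const int16_t *scan, const qm_val_t *iqmatrix,
                             int eob) {
    for (int i = 0; i < eob; ++i) {
      const int rc = scan[i];  // TX_32X32 scan, reused for the 64-point sizes
      assert(rc < 32 * 32);    // never over-reads the aliased 1024-entry table
      (void)iqmatrix[rc];
    }
  }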
......
This source diff could not be displayed because it is too large.
@@ -69,7 +69,7 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
const TX_SIZE txs_ctx = get_txsize_context(tx_size);
const PLANE_TYPE plane_type = get_plane_type(plane);
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
- const int seg_eob = tx_size_2d[tx_size];
+ const int seg_eob = av1_get_max_eob(tx_size);
int c = 0;
int update_eob = -1;
const int16_t *const dequant =
......
@@ -110,7 +110,7 @@ static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff,
int ctx, const int16_t *scan, const int16_t *nb,
int16_t *max_scan_line, aom_reader *r) {
FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
- const int max_eob = tx_size_2d[tx_size];
+ const int max_eob = av1_get_max_eob(tx_size);
const int ref = is_inter_block(&xd->mi[0]->mbmi);
#if CONFIG_AOM_QM && !CONFIG_NEW_QUANT
const qm_val_t *iqmatrix = iqm[tx_size];
......
@@ -427,7 +427,7 @@ static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
TOKEN_STATS *token_stats) {
const TOKENEXTRA *p = *tp;
int count = 0;
- const int seg_eob = tx_size_2d[tx_size];
+ const int seg_eob = av1_get_max_eob(tx_size);
#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
if (tx_type == MRC_DCT && ((is_inter && SIGNAL_MRC_MASK_INTER) ||
......
@@ -2528,6 +2528,10 @@ void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
}
// Zero out the bottom 64x32 area.
memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int row = 1; row < 32; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+ }
}
void av1_fht64x32_c(const int16_t *input, tran_low_t *output, int stride,
@@ -2623,6 +2627,10 @@ void av1_fht64x32_c(const int16_t *input, tran_low_t *output, int stride,
for (int row = 0; row < n; ++row) {
memset(output + row * n2 + n, 0, n * sizeof(*output));
}
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int row = 1; row < 32; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+ }
}
void av1_fht32x64_c(const int16_t *input, tran_low_t *output, int stride,
@@ -2714,6 +2722,7 @@ void av1_fht32x64_c(const int16_t *input, tran_low_t *output, int stride,
// Zero out the bottom 32x32 area.
memset(output + n * n, 0, n * n * sizeof(*output));
+ // Note: no repacking needed here.
}
#endif // CONFIG_TX64X64
......
@@ -134,7 +134,7 @@ static int optimize_b_greedy(const AV1_COMMON *cm, MACROBLOCK *mb, int plane,
const int eob = p->eobs[block];
assert(mb->qindex > 0);
assert((!plane_type && !plane) || (plane_type && plane));
- assert(eob <= tx_size_2d[tx_size]);
+ assert(eob <= av1_get_max_eob(tx_size));
const int ref = is_inter_block(&xd->mi[0]->mbmi);
const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
@@ -510,12 +510,20 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
#if CONFIG_AOM_QM
int seg_id = mbmi->segment_id;
// Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
- const qm_val_t *qmatrix = IS_2D_TRANSFORM(tx_type)
- ? pd->seg_qmatrix[seg_id][tx_size]
- : cm->gqmatrix[NUM_QM_LEVELS - 1][0][tx_size];
- const qm_val_t *iqmatrix = IS_2D_TRANSFORM(tx_type)
- ? pd->seg_iqmatrix[seg_id][tx_size]
- : cm->giqmatrix[NUM_QM_LEVELS - 1][0][tx_size];
+ const TX_SIZE qm_tx_size =
+ #if CONFIG_TX64X64
+ tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64
+ ? TX_32X32
+ :
+ #endif  // CONFIG_TX64X64
+ tx_size;
+ const qm_val_t *qmatrix =
+ IS_2D_TRANSFORM(tx_type) ? pd->seg_qmatrix[seg_id][qm_tx_size]
+ : cm->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+ const qm_val_t *iqmatrix =
+ IS_2D_TRANSFORM(tx_type)
+ ? pd->seg_iqmatrix[seg_id][qm_tx_size]
+ : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
#endif
TxfmParam txfm_param;
@@ -531,7 +539,6 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
#endif
#endif
- const int tx2d_size = tx_size_2d[tx_size];
QUANT_PARAM qparam;
const int16_t *src_diff;
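The qm_tx_size ternary above repeats the size test already embedded in av1_get_max_eob(). A hypothetical helper (not part of this commit) could name the mapping once and serve both spots:

  // Map the 64-point transform sizes to TX_32X32 for quantization-matrix
  // lookup; all other sizes keep their own matrices.
  static INLINE TX_SIZE av1_get_qm_tx_size(TX_SIZE tx_size) {
  #if CONFIG_TX64X64
    if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64)
      return TX_32X32;
  #endif  // CONFIG_TX64X64
    return tx_size;
  }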
@@ -611,16 +618,17 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
#endif // CONFIG_TXMG
if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ const int n_coeffs = av1_get_max_eob(tx_size);
if (LIKELY(!x->skip_block)) {
#if CONFIG_DAALA_TX
- quant_func_list[xform_quant_idx][1](coeff, tx2d_size, p, qcoeff, dqcoeff,
+ quant_func_list[xform_quant_idx][1](coeff, n_coeffs, p, qcoeff, dqcoeff,
eob, scan_order, &qparam);
#else
quant_func_list[xform_quant_idx][txfm_param.is_hbd](
- coeff, tx2d_size, p, qcoeff, dqcoeff, eob, scan_order, &qparam);
+ coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, &qparam);
#endif
} else {
- av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+ av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
}
}
#if CONFIG_LV_MAP
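Passing n_coeffs = 1024 rather than 4096 is sufficient here because the forward transform has already repacked every surviving coefficient into the first 32 * 32 slots, which is all the reused scan can ever visit. For reference, the skip path amounts to clearing that region (a behavioural sketch of av1_quantize_skip, not the library source):

  static void quantize_skip_sketch(intptr_t n_coeffs, tran_low_t *qcoeff,
                                   tran_low_t *dqcoeff, uint16_t *eob_ptr) {
    memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff));
    memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff));
    *eob_ptr = 0;  // no coded coefficients remain
  }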
......
@@ -327,7 +327,7 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, mbmi);
const int16_t *scan = scan_order->scan;
- const int seg_eob = tx_size_2d[tx_size];
+ const int seg_eob = av1_get_max_eob(tx_size);
int c;
const int bwl = tx_size_wide_log2[tx_size];
const int width = tx_size_wide[tx_size];
@@ -683,7 +683,7 @@ int av1_cost_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
cost += av1_tx_type_cost(cm, x, xd, mbmi->sb_type, plane, tx_size, tx_type);
#endif
- const int seg_eob = tx_size_2d[tx_size];
+ const int seg_eob = av1_get_max_eob(tx_size);
int eob_cost = get_eob_cost(eob, seg_eob, coeff_costs, tx_type);
av1_get_br_level_counts(levels, width, height, level_counts);
@@ -1358,7 +1358,7 @@ static int get_low_coeff_cost(int coeff_idx, const TxbCache *txb_cache,
static INLINE void set_eob(TxbInfo *txb_info, int eob) {
txb_info->eob = eob;
- txb_info->seg_eob = tx_size_2d[txb_info->tx_size];
+ txb_info->seg_eob = av1_get_max_eob(txb_info->tx_size);
}
// TODO(angiebird): add static to this function once it's called
@@ -1814,7 +1814,7 @@ static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
int update = 0;
// return update; //TODO: training only.
if (txb_info->eob == 0) return update;
- const int max_eob = tx_size_2d[txb_info->tx_size];
+ const int max_eob = av1_get_max_eob(txb_info->tx_size);
#if TEST_OPTIMIZE_TXB
int64_t sse;
@@ -1967,7 +1967,7 @@ static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
int cost_diff = 0;
int64_t dist_diff = 0;
int64_t rd_diff = 0;
- const int max_eob = tx_size_2d[txb_info->tx_size];
+ const int max_eob = av1_get_max_eob(txb_info->tx_size);
#if TEST_OPTIMIZE_TXB
int64_t sse;
@@ -2095,7 +2095,7 @@ int av1_optimize_txb(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
const int16_t *dequant = p->dequant_QTX;
- const int seg_eob = tx_size_2d[tx_size];
+ const int seg_eob = av1_get_max_eob(tx_size);
const int bwl = tx_size_wide_log2[tx_size];
const int width = tx_size_wide[tx_size];
const int height = tx_size_high[tx_size];
......
@@ -1929,7 +1929,7 @@ void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
) {
// Transform domain distortion computation is more efficient as it does
// not involve an inverse transform, but it is less accurate.
- const int buffer_length = tx_size_2d[tx_size];
+ const int buffer_length = av1_get_max_eob(tx_size);
int64_t this_sse;
// TX-domain results need to shift down to Q2/D10 to match pixel
// domain distortion values which are in Q2^2
@@ -2126,7 +2126,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
#endif
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
- const int buffer_length = tx_size_2d[tx_size];
+ const int buffer_length = av1_get_max_eob(tx_size);
int64_t tmp_dist;
int64_t tmp;
#if CONFIG_DAALA_TX
@@ -3744,7 +3744,7 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
#endif
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- const int buffer_length = tx_size_2d[tx_size];
+ const int buffer_length = av1_get_max_eob(tx_size);
int64_t tmp_dist, tmp_sse;
#if CONFIG_DIST_8X8
int blk_w = block_size_wide[plane_bsize];
......
@@ -137,7 +137,7 @@ static INLINE int av1_get_token_cost(int v, int16_t *token, int cat6_bits) {
static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
TX_SIZE tx_size) {
- const int eob_max = tx_size_2d[tx_size];
+ const int eob_max = av1_get_max_eob(tx_size);
return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
}
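Callers of av1_get_tx_eob() pick up the cap automatically. A usage sketch (the wrapper is hypothetical; AV1Common and its seg field are aom's):

  static int max_eob_for_segment(const struct AV1Common *cm, int segment_id) {
    // With this change, a TX_64X64 block yields 1024 here instead of 4096;
    // a segment with SEG_LVL_SKIP active still yields 0.
    return av1_get_tx_eob(&cm->seg, segment_id, TX_64X64);
  }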
......
@@ -75,7 +75,7 @@ class BlockErrorTest : public ::testing::TestWithParam<BlockErrorParam> {
}
}
- intptr_t getCoeffNum() { return tx_size_2d[tx_size_]; }
+ intptr_t getCoeffNum() { return av1_get_max_eob(tx_size_); }
void FillRandomData() {
const intptr_t block_size = getCoeffNum();
......
@@ -171,7 +171,7 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
}
}
- int coeff_num() const { return tx_size_2d[tx_size_]; }
+ int coeff_num() const { return av1_get_max_eob(tx_size_); }
void FillCoeff(tran_low_t c) {
const int n_coeffs = coeff_num();
......