Reduce dqcoeff array size in decoder

The decoding process handles detokenization and reconstruction per
transform block sequentially. There is no need to offset the dqcoeff
buffer according to the transform block index. Dropping the offset lets
the buffer shrink to the size of the largest transform block, which
reduces memory spill and improves cache performance.

......@@ -188,8 +188,11 @@ typedef struct macroblockd {
/* dqcoeff are shared by all the planes. So planes must be decoded serially */
DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]);
DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
int lossless;
int corrupted;
......@@ -188,7 +188,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
struct macroblockd_plane *const pd = &xd->plane[plane];
if (eob > 0) {
TX_TYPE tx_type = DCT_DCT;
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
tran_low_t *const dqcoeff = pd->dqcoeff;
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
if (xd->lossless) {
......@@ -217,7 +217,7 @@ int vp9_decode_block_tokens(MACROBLOCKD *xd,
pd->left_context + y);
const scan_order *so = get_scan(xd, tx_size, pd->plane_type, block);
const int eob = decode_coefs(xd, pd->plane_type,
BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
pd->dqcoeff, tx_size,
dequant, ctx, so->scan, so->neighbors, r);
vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
return eob;
