Commit 7a428ba2 authored by Yushin Cho

Add a new experiment, DAALA_DIST

This commit adds a new experiment, Daala's distortion function,
which is designed to better approximate perceptual distortion
in 8x8 pixel blocks.

This experiment is expected to work best with PVQ.

It measures the variance of the overlapping 4x4 regions in an 8x8 area,
then uses these variances to scale a weighted frequency-domain MSE of
the 8x8 block.
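
For illustration, a rough self-contained sketch of the computation is
given below. The names, constants, and the naive float DCT in the
sketch are mine and differ from the patch; the real code is
od_compute_var_4x4() / od_compute_dist_8x8() in this diff, which uses
AV1's fwd_txfm() and the OD_QM8_Q4_HVS weighting matrix plus extra
calibration constants.

#include <limits.h>
#include <math.h>

/* Variance of a 4x4 region, scaled the same way as the patch does. */
static int var4x4(const int *p, int stride) {
  int sum = 0, ssq = 0;
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 4; j++) {
      int t = p[i * stride + j];
      sum += t;
      ssq += t * t;
    }
  return (ssq - (sum * sum >> 4)) >> 4;
}

/* Naive 8x8 DCT-II, only so that the sketch is self-contained. */
static void dct8x8(const double *in, double *out) {
  const double pi = 3.14159265358979323846;
  for (int u = 0; u < 8; u++)
    for (int v = 0; v < 8; v++) {
      double s = 0;
      for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
          s += in[i * 8 + j] * cos((2 * i + 1) * u * pi / 16) *
               cos((2 * j + 1) * v * pi / 16);
      out[u * 8 + v] = s;
    }
}

/* Distortion of one 8x8 luma block: an activity factor derived from the
   variances of the nine overlapping 4x4 regions scales a frequency-
   weighted MSE of the reconstruction error. */
double daala_dist_8x8_sketch(const int *src, const int *rec, int stride,
                             const double *weight /* 64 HVS weights */) {
  int min_var = INT_MAX;
  double vardist = 0;
  for (int i = 0; i < 3; i++)
    for (int j = 0; j < 3; j++) {
      int vx = var4x4(src + 2 * i * stride + 2 * j, stride);
      int vy = var4x4(rec + 2 * i * stride + 2 * j, stride);
      if (vx < min_var) min_var = vx;
      /* (sqrt(vx) - sqrt(vy))^2 penalizes changes in local variance. */
      vardist += vx - 2 * sqrt((double)vx * vy) + vy;
    }
  /* 0.25 is a noise floor and -1/6 the activity masking exponent. */
  double activity = pow(0.25 + min_var, -1.0 / 6);
  double err[64], et[64], sum = 0;
  for (int k = 0; k < 64; k++)
    err[k] = src[(k / 8) * stride + k % 8] - rec[(k / 8) * stride + k % 8];
  dct8x8(err, et);
  for (int k = 0; k < 64; k++) sum += et[k] * et[k] / weight[k];
  return activity * activity * (sum + vardist);
}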

Since AV1 calculates distortion in blocks as small as 4x4, it is not
possible to directly replace AV1's existing distortion functions,
such as dist_block() and block_rd_txfm().
Hence, there have been substantial changes in order to apply
Daala's 8x8 distortion function.
The Daala distortion function is applied
after all 4x4 tx blocks in an 8x8 block are encoded (during RDO),
as in the two cases below:
1) intra/inter sub8x8 predictions and
2) 4x4 transform with prediction size >= 8.

To enable this experiment, pass '--enable-daala-dist' to configure.

TODO: Significant tuning of parameters is required since the function
originally came from Daala, so most of its parameters may not work
correctly outside Daala.
It is also a critical issue that chroma distortion is still added to
the distortion in AV1's RDO, since Daala's distortion function is
applied to luma only while chroma continues to use MSE.

Change-Id: If35fdd3aec7efe401f351ba1c99891ad57a3d957
parent 48b1cb35
@@ -153,6 +153,44 @@ void av1_foreach_transformed_block_in_plane(
}
}
#if CONFIG_DAALA_DIST
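// Like av1_foreach_transformed_block_in_plane(), but additionally calls
// 'mi_visit' once per 8x8 area after its 4x4 tx blocks have been visited,
// so that the 8x8 Daala distortion can be computed during RDO.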
void av1_foreach_8x8_transformed_block_in_plane(
const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
foreach_transformed_block_visitor visit,
foreach_transformed_block_visitor mi_visit, void *arg) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
// block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
// 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
// transform size varies per plane, look it up in a common way.
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
const uint8_t txw_unit = tx_size_wide_unit[tx_size];
const uint8_t txh_unit = tx_size_high_unit[tx_size];
const int step = txw_unit * txh_unit;
int i = 0, r, c;
// If mb_to_right_edge is < 0 we are in a situation in which
// the current block size extends into the UMV and we won't
// visit the sub blocks that are wholly within the UMV.
const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
// Keep track of the row and column of the blocks we use so that we know
// if we are in the unrestricted motion border.
for (r = 0; r < max_blocks_high; r += txh_unit) {
// Skip visiting the sub blocks that are wholly within the UMV.
for (c = 0; c < max_blocks_wide; c += txw_unit) {
visit(plane, i, r, c, plane_bsize, tx_size, arg);
// Call mi_visit after the bottom-right 4x4 block of each 8x8 area is done.
if ((r & 1) && (c & 1))
mi_visit(plane, i, r - 1, c - 1, plane_bsize, TX_8X8, arg);
i += step;
}
}
}
#endif
void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
BLOCK_SIZE bsize,
foreach_transformed_block_visitor visit,
......
@@ -456,7 +456,7 @@ typedef struct macroblockd_plane {
const qm_val_t *seg_qmatrix[MAX_SEGMENTS][2][TX_SIZES];
#endif
#if CONFIG_PVQ
#if CONFIG_PVQ || CONFIG_DAALA_DIST
DECLARE_ALIGNED(16, int16_t, pred[MAX_SB_SQUARE]);
// PVQ: forward transformed predicted image, a reference for PVQ.
tran_low_t *pvq_ref_coeff;
@@ -972,6 +972,13 @@ void av1_foreach_transformed_block_in_plane(
const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
foreach_transformed_block_visitor visit, void *arg);
#if CONFIG_DAALA_DIST
void av1_foreach_8x8_transformed_block_in_plane(
const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
foreach_transformed_block_visitor visit,
foreach_transformed_block_visitor mi_visit, void *arg);
#endif
void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
BLOCK_SIZE bsize,
foreach_transformed_block_visitor visit,
......
@@ -23,71 +23,6 @@
#include <stdlib.h>
#include <string.h>
/* Quantization matrices for 8x8. For other block sizes, we currently just do
resampling. */
/* Flat quantization, i.e. optimize for PSNR. */
const int OD_QM8_Q4_FLAT[] = {
16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16
};
# if 0
/* M1: MPEG2 matrix for inter (which has a dead zone). */
const int OD_QM8_Q4[] = {
16, 17, 18, 19, 20, 21, 22, 23,
17, 18, 19, 20, 21, 22, 23, 24,
18, 19, 20, 21, 22, 23, 24, 25,
19, 20, 21, 22, 23, 24, 26, 27,
20, 21, 22, 23, 25, 26, 27, 28,
21, 22, 23, 24, 26, 27, 28, 30,
22, 23, 24, 26, 27, 28, 30, 31,
23, 24, 25, 27, 28, 30, 31, 33};
# endif
# if 0
/* M2: MPEG2 matrix for intra (no dead zone). */
const int OD_QM8_Q4[] = {
16, 16, 19, 22, 22, 26, 26, 27,
16, 16, 22, 22, 26, 27, 27, 29,
19, 22, 26, 26, 27, 29, 29, 35,
22, 24, 27, 27, 29, 32, 34, 38,
26, 27, 29, 29, 32, 35, 38, 46,
27, 29, 34, 34, 35, 40, 46, 56,
29, 34, 34, 37, 40, 48, 56, 69,
34, 37, 38, 40, 48, 58, 69, 83
};
# endif
# if 0
/* M3: Taken from dump_psnrhvs. */
const int OD_QM8_Q4[] = {
16, 16, 17, 20, 24, 29, 36, 42,
16, 17, 17, 19, 22, 26, 31, 37,
17, 17, 21, 23, 26, 30, 34, 40,
20, 19, 23, 28, 31, 35, 39, 45,
24, 22, 26, 31, 36, 41, 46, 51,
29, 26, 30, 35, 41, 47, 52, 58,
36, 31, 34, 39, 46, 52, 59, 66,
42, 37, 40, 45, 51, 58, 66, 73
};
# endif
# if 1
/* M4: a compromise equal to .5*(M3 + .5*(M2+transpose(M2))) */
const int OD_QM8_Q4_HVS[] = {
16, 16, 18, 21, 24, 28, 32, 36,
16, 17, 20, 21, 24, 27, 31, 35,
18, 20, 24, 25, 27, 31, 33, 38,
21, 21, 25, 28, 30, 34, 37, 42,
24, 24, 27, 30, 34, 38, 43, 49,
28, 27, 31, 34, 38, 44, 50, 58,
32, 31, 33, 37, 43, 50, 58, 68,
36, 35, 38, 42, 49, 58, 68, 78
};
#endif
/* Imported from encode.c in daala */
/* These are the PVQ equivalent of quantization matrices, except that
the values are per-band. */
......
@@ -16,14 +16,15 @@
# include "generic_code.h"
# include "odintrin.h"
extern const int OD_QM8_Q4_FLAT[];
extern const int OD_QM8_Q4_HVS[];
extern const uint16_t EXP_CDF_TABLE[][16];
extern const uint16_t LAPLACE_OFFSET[];
#define AV1_PVQ_ENABLE_ACTIVITY_MASKING (0)
#if !CONFIG_DAALA_DIST
#define AV1_PVQ_ENABLE_ACTIVITY_MASKING (0)
#endif
# define PVQ_MAX_PARTITIONS (1 + 3*(OD_TXSIZES-1))
# define OD_NOREF_ADAPT_SPEED (4)
......
@@ -11280,3 +11280,62 @@ static uint16_t wt_matrix_ref[NUM_QM_LEVELS][2][2][4 * 4 + 8 * 8 + 16 * 16 +
};
#endif
#if CONFIG_PVQ || CONFIG_DAALA_DIST
/* Quantization matrices for 8x8. For other block sizes, we currently just do
resampling. */
/* Flat quantization, i.e. optimize for PSNR. */
const int OD_QM8_Q4_FLAT[] = { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16 };
#if 0
/* M1: MPEG2 matrix for inter (which has a dead zone). */
const int OD_QM8_Q4[] = {
16, 17, 18, 19, 20, 21, 22, 23,
17, 18, 19, 20, 21, 22, 23, 24,
18, 19, 20, 21, 22, 23, 24, 25,
19, 20, 21, 22, 23, 24, 26, 27,
20, 21, 22, 23, 25, 26, 27, 28,
21, 22, 23, 24, 26, 27, 28, 30,
22, 23, 24, 26, 27, 28, 30, 31,
23, 24, 25, 27, 28, 30, 31, 33};
#endif
#if 0
/* M2: MPEG2 matrix for intra (no dead zone). */
const int OD_QM8_Q4[] = {
16, 16, 19, 22, 22, 26, 26, 27,
16, 16, 22, 22, 26, 27, 27, 29,
19, 22, 26, 26, 27, 29, 29, 35,
22, 24, 27, 27, 29, 32, 34, 38,
26, 27, 29, 29, 32, 35, 38, 46,
27, 29, 34, 34, 35, 40, 46, 56,
29, 34, 34, 37, 40, 48, 56, 69,
34, 37, 38, 40, 48, 58, 69, 83
};
#endif
#if 0
/* M3: Taken from dump_psnrhvs. */
const int OD_QM8_Q4[] = {
16, 16, 17, 20, 24, 29, 36, 42,
16, 17, 17, 19, 22, 26, 31, 37,
17, 17, 21, 23, 26, 30, 34, 40,
20, 19, 23, 28, 31, 35, 39, 45,
24, 22, 26, 31, 36, 41, 46, 51,
29, 26, 30, 35, 41, 47, 52, 58,
36, 31, 34, 39, 46, 52, 59, 66,
42, 37, 40, 45, 51, 58, 66, 73
};
#endif
#if 1
/* M4: a compromise equal to .5*(M3 + .5*(M2+transpose(M2))) */
const int OD_QM8_Q4_HVS[] = { 16, 16, 18, 21, 24, 28, 32, 36, 16, 17, 20,
21, 24, 27, 31, 35, 18, 20, 24, 25, 27, 31,
33, 38, 21, 21, 25, 28, 30, 34, 37, 42, 24,
24, 27, 30, 34, 38, 43, 49, 28, 27, 31, 34,
38, 44, 50, 58, 32, 31, 33, 37, 43, 50, 58,
68, 36, 35, 38, 42, 49, 58, 68, 78 };
#endif
#endif
@@ -98,6 +98,11 @@ static INLINE int get_dq_profile_from_ctx(int qindex, int q_ctx, int is_inter,
}
#endif // CONFIG_NEW_QUANT
#if CONFIG_PVQ || CONFIG_DAALA_DIST
extern const int OD_QM8_Q4_FLAT[];
extern const int OD_QM8_Q4_HVS[];
#endif
#ifdef __cplusplus
} // extern "C"
#endif
......
@@ -3285,6 +3285,10 @@ static void daala_dec_init(AV1_COMMON *const cm, daala_dec_ctx *daala_dec,
// TODO(yushin) : activity masking info needs to be signaled by the bitstream
daala_dec->use_activity_masking = AV1_PVQ_ENABLE_ACTIVITY_MASKING;
#if !CONFIG_DAALA_DIST
daala_dec->use_activity_masking = 0;
#endif
if (daala_dec->use_activity_masking)
daala_dec->qm = OD_HVS_QM;
else
......
@@ -202,6 +202,13 @@ struct macroblock {
int pvq_speed;
int pvq_coded; // Indicates whether pvq_info needs to be stored to tokenize
#endif
#if CONFIG_DAALA_DIST
// Keep rate of each 4x4 block in the current macroblock during RDO
// This is needed when using the 8x8 Daala distortion metric during RDO,
// because it evaluates distortion in a different order than the underlying
// 4x4 blocks are coded.
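// (256 covers the 4x4 blocks of a 64x64 superblock.)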
int rate_4x4[256];
#endif
};
// Converts block_index for given transform size to index of the block in raster
......
@@ -479,7 +479,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
TX_SIZE tx_size, int ctx,
AV1_XFORM_QUANT xform_quant_idx) {
MACROBLOCKD *const xd = &x->e_mbd;
#if !CONFIG_PVQ
#if !(CONFIG_PVQ || CONFIG_DAALA_DIST)
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
#else
@@ -504,6 +504,14 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
FWD_TXFM_PARAM fwd_txfm_param;
#if CONFIG_PVQ || CONFIG_DAALA_DIST
uint8_t *dst;
int16_t *pred;
const int dst_stride = pd->dst.stride;
int tx_blk_size;
int i, j;
#endif
#if !CONFIG_PVQ
const int tx2d_size = tx_size_2d[tx_size];
QUANT_PARAM qparam;
@@ -523,14 +531,11 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
#else
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
uint8_t *src, *dst;
int16_t *src_int16, *pred;
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
int tx_blk_size;
int i, j;
int skip = 1;
PVQ_INFO *pvq_info = NULL;
uint8_t *src;
int16_t *src_int16;
const int src_stride = p->src.stride;
(void)scan_order;
(void)qcoeff;
@@ -539,10 +544,20 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
assert(block < MAX_PVQ_BLOCKS_IN_SB);
pvq_info = &x->pvq[block][plane];
}
dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
src_int16 =
&p->src_int16[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
// transform block size in pixels
tx_blk_size = tx_size_wide[tx_size];
for (j = 0; j < tx_blk_size; j++)
for (i = 0; i < tx_blk_size; i++)
src_int16[diff_stride * j + i] = src[src_stride * j + i];
#endif
#if CONFIG_PVQ || CONFIG_DAALA_DIST
dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
// transform block size in pixels
@@ -552,10 +567,10 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
// in order to use existing VP10 transform functions
for (j = 0; j < tx_blk_size; j++)
for (i = 0; i < tx_blk_size; i++) {
src_int16[diff_stride * j + i] = src[src_stride * j + i];
pred[diff_stride * j + i] = dst[dst_stride * j + i];
}
#endif
(void)ctx;
fwd_txfm_param.tx_type = tx_type;
......
@@ -55,6 +55,9 @@
#if CONFIG_PVQ
#include "av1/encoder/pvq_encoder.h"
#endif
#if CONFIG_PVQ || CONFIG_DAALA_DIST
#include "av1/common/pvq.h"
#endif
#if CONFIG_DUAL_FILTER
#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
@@ -448,6 +451,170 @@ static const TX_TYPE_1D htx_tab[TX_TYPES] = {
#endif // CONFIG_EXT_TX
};
#if CONFIG_DAALA_DIST
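/* Returns the variance of a 4x4 block of pixel values, i.e. E[x^2] - E[x]^2:
the sum of squares minus the squared sum / 16, divided by the 16 samples. */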
static int od_compute_var_4x4(od_coeff *x, int stride) {
int sum;
int s2;
int i;
sum = 0;
s2 = 0;
for (i = 0; i < 4; i++) {
int j;
for (j = 0; j < 4; j++) {
int t;
t = x[i * stride + j];
sum += t;
s2 += t * t;
}
}
// TODO(yushin) : Check whether any changes are required for high bit depth.
return (s2 - (sum * sum >> 4)) >> 4;
}
static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x,
od_coeff *y, int stride) {
od_coeff e[8 * 8];
od_coeff et[8 * 8];
int16_t src[8 * 8];
tran_low_t coeff[8 * 8];
double sum;
int min_var;
double mean_var;
double var_stat;
double activity;
double calibration;
int i;
int j;
double vardist;
FWD_TXFM_PARAM fwd_txfm_param;
vardist = 0;
OD_ASSERT(qm != OD_FLAT_QM);
#if 1
min_var = INT_MAX;
mean_var = 0;
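/* Measure the variance of the nine overlapping 4x4 sub-blocks of the 8x8
area (at 2-pixel offsets) in both the source (x) and the reconstruction
(y). */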
for (i = 0; i < 3; i++) {
for (j = 0; j < 3; j++) {
int varx;
int vary;
varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride);
vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride);
min_var = OD_MINI(min_var, varx);
mean_var += 1. / (1 + varx);
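/* vardist accumulates (sqrt(varx) - sqrt(vary))^2, penalizing changes in
local variance between the source and the reconstruction. */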
/* The cast to (double) is to avoid an overflow before the sqrt.*/
vardist += varx - 2 * sqrt(varx * (double)vary) + vary;
}
}
/* We use a different variance statistic depending on whether activity
masking is used, since the harmonic mean appeared slightly worse with
masking off. The calibration constant just ensures that we preserve the
rate compared to activity=1. */
if (use_activity_masking) {
calibration = 1.95;
var_stat = 9. / mean_var;
} else {
calibration = 1.62;
var_stat = min_var;
}
/* 1.62 is a calibration constant, 0.25 is a noise floor and 1/6 is the
activity masking constant. */
activity = calibration * pow(.25 + var_stat, -1. / 6);
#else
activity = 1;
#endif
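/* Compute the pixel-domain error, forward transform it with an 8x8 DCT, and
accumulate the squared transform coefficients weighted by the PSNR-HVS
quantization matrix; the activity factor then scales the final sum. */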
sum = 0;
for (i = 0; i < 8; i++) {
for (j = 0; j < 8; j++)
e[8 * i + j] = x[i * stride + j] - y[i * stride + j];
}
for (i = 0; i < 8; i++)
for (j = 0; j < 8; j++) src[8 * i + j] = e[8 * i + j];
fwd_txfm_param.tx_type = DCT_DCT;
fwd_txfm_param.tx_size = TX_8X8;
fwd_txfm_param.lossless = 0;
fwd_txfm(&src[0], &coeff[0], 8, &fwd_txfm_param);
for (i = 0; i < 8; i++)
for (j = 0; j < 8; j++) et[8 * i + j] = coeff[8 * i + j] >> 3;
sum = 0;
for (i = 0; i < 8; i++)
for (j = 0; j < 8; j++)
sum += et[8 * i + j] * (double)et[8 * i + j] * 16. /
OD_QM8_Q4_HVS[i * 8 + j];
return activity * activity * (sum + vardist);
}
// Note: Inputs x and y are in the pixel domain.
static double od_compute_dist(int qm, int activity_masking, od_coeff *x,
od_coeff *y, int bsize_w, int bsize_h,
int qindex) {
int i;
double sum;
sum = 0;
(void)qindex;
assert(bsize_w >= 8 && bsize_h >= 8);
if (qm == OD_FLAT_QM) {
for (i = 0; i < bsize_w * bsize_h; i++) {
double tmp;
tmp = x[i] - y[i];
sum += tmp * tmp;
}
} else {
for (i = 0; i < bsize_h; i += 8) {
int j;
for (j = 0; j < bsize_w; j += 8) {
sum += od_compute_dist_8x8(qm, activity_masking, &x[i * bsize_w + j],
&y[i * bsize_w + j], bsize_w);
}
}
/* Compensate for the fact that the quantization matrix lowers the
distortion value. We tried a half-dozen values and picked the one where
we liked the ntt-short1 curves best. The tuning is approximate since
the different metrics go in different directions. */
/*Start interpolation at coded_quantizer 1.7=f(36) and end it at 1.2=f(47)*/
// TODO(yushin): Check whether AV1's qindex works here in place of Daala's
// coded_quantizer.
/*sum *= qindex >= 47 ? 1.2 :
qindex <= 36 ? 1.7 :
1.7 + (1.2 - 1.7)*(qindex - 36)/(47 - 36);*/
}
return sum;
}
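/* Daala distortion between source and reconstruction, both given in the
pixel domain; the pixels are copied into od_coeff buffers and passed to
od_compute_dist(). Callers only use this for blocks of at least 8x8. */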
static int64_t av1_daala_dist(const uint8_t *src, int src_stride,
const uint8_t *dst, int dst_stride, int tx_size,
int qm, int use_activity_masking, int qindex) {
int i, j;
int64_t d;
const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
const int bsw = block_size_wide[tx_bsize];
const int bsh = block_size_high[tx_bsize];
DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]);
DECLARE_ALIGNED(16, od_coeff, rec[MAX_TX_SQUARE]);
assert(qm == OD_HVS_QM);
for (j = 0; j < bsh; j++)
for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
for (j = 0; j < bsh; j++)
for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
d = (int64_t)od_compute_dist(qm, use_activity_masking, orig, rec, bsw, bsh,
qindex);
return d;
}
#endif // #if CONFIG_DAALA_DIST
static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
@@ -829,9 +996,10 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
// TODO(yushin) : Since the 4x4 case does not need ssz, it would be better to
// refactor this into a separate function that skips the extra computations
// for ssz.
int64_t av1_block_error2_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
const tran_low_t *ref, intptr_t block_size,
int64_t *ssz) {
static int64_t av1_block_error2_c(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
const tran_low_t *ref, intptr_t block_size,
int64_t *ssz) {
int64_t error;
// Use the existing sse codes for calculating distortion of decoded signal:
@@ -1015,7 +1183,15 @@ static void dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
if (cpi->sf.use_transform_domain_distortion) {
#if CONFIG_DAALA_DIST
int qm = OD_HVS_QM;
int use_activity_masking = 0;
#if CONFIG_PVQ
use_activity_masking = x->daala_enc.use_activity_masking;
#endif
#endif
if (cpi->sf.use_transform_domain_distortion && !CONFIG_DAALA_DIST) {
// Transform domain distortion computation is more efficient as it does
// not involve an inverse transform, but it is less accurate.
const int buffer_length = tx_size_2d[tx_size];
@@ -1061,7 +1237,17 @@ static void dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
assert(cpi != NULL);
assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
#if CONFIG_DAALA_DIST
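// For luma, use the Daala distortion when the transform block is at least
// 8x8; smaller luma blocks report 0 here, since the 8x8 Daala distortion is
// computed once the whole 8x8 area has been coded. Chroma keeps using the
// variance function below.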
if (plane == 0) {
if (bsw >= 8 && bsh >= 8)
tmp = av1_daala_dist(src, src_stride, dst, dst_stride, tx_size, qm,
use_activity_masking, x->qindex);
else
tmp = 0;
} else
#endif
cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
*out_sse = (int64_t)tmp * 16;
if (eob) {
@@ -1106,10 +1292,17 @@ static void dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
#endif
inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param);
}
cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, MAX_TX_SIZE, &tmp);
#if CONFIG_DAALA_DIST
if (plane == 0) {
if (bsw >= 8 && bsh >= 8)
tmp = av1_daala_dist(src, src_stride, recon, MAX_TX_SIZE, tx_size, qm,
use_activity_masking, x->qindex);
else
tmp = 0;
} else
#endif
cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, MAX_TX_SIZE, &tmp);
}
*out_dist = (int64_t)tmp * 16;
}
}
@@ -1176,6 +1369,14 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
int coeff_ctx = combine_entropy_contexts(*(args->t_above + blk_col),
*(args->t_left + blk_row));
RD_STATS this_rd_stats;
#if CONFIG_DAALA_DIST
int qm = OD_HVS_QM;
int use_activity_masking = 0;
#if CONFIG_PVQ
use_activity_masking = x->daala_enc.use_activity_masking;
#endif
#endif
av1_init_rd_stats(&this_rd_stats);
if (args->exit_early) return;
@@ -1186,8 +1387,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
};
av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
&b_args);
if (args->cpi->sf.use_transform_domain_distortion) {
if (args->cpi->sf.use_transform_domain_distortion && !CONFIG_DAALA_DIST) {
dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size,
&this_rd_stats.dist, &this_rd_stats.sse);
} else {
@@ -1195,7 +1395,6 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
// inv_txfm_add, so we can't just call dist_block here.
const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
const aom_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -1210,18 +1409,56 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
const int16_t *diff = &p->src_diff[(blk_row * diff_stride + blk_col)
<< tx_size_wide_log2[0]];
unsigned int tmp;
this_rd_stats.sse = sum_squares_2d(diff, diff_stride, tx_size);
#if CONFIG_DAALA_DIST
if (plane == 0) {
const int bsw = block_size_wide[tx_bsize];
const int bsh = block_size_high[tx_bsize];
if (bsw >= 8 && bsh >= 8) {
const int16_t *pred = &pd->pred[(blk_row * diff_stride + blk_col)
<< tx_size_wide_log2[0]];