diff --git a/av1/common/cfl.c b/av1/common/cfl.c index b9e356be6ee626ad461192e4c3b5e8dc2d411f7a..1ba6dda02b8b4b2e1230d41e7da231e176349a67 100644 --- a/av1/common/cfl.c +++ b/av1/common/cfl.c @@ -179,8 +179,14 @@ static void cfl_dc_pred(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) { // TODO(ltrudeau) Because of max_block_wide and max_block_high, num_pel will // not be a power of two. So these divisions will have to use a lookup table. - cfl->dc_pred_q7[CFL_PRED_U] = (sum_u << 7) / num_pel; - cfl->dc_pred_q7[CFL_PRED_V] = (sum_v << 7) / num_pel; + cfl->dc_pred_q6[CFL_PRED_U] = ((sum_u << 6) + (num_pel >> 1)) / num_pel; + cfl->dc_pred_q6[CFL_PRED_V] = ((sum_v << 6) + (num_pel >> 1)) / num_pel; + + // Loss is never more than 1/2 (in Q6) + assert(fabs(cfl->dc_pred_q6[CFL_PRED_U] - (sum_u / ((double)num_pel) * 64)) <= + 0.5); + assert(fabs(cfl->dc_pred_q6[CFL_PRED_V] - (sum_v / ((double)num_pel) * 64)) <= + 0.5); } static void cfl_compute_averages(CFL_CTX *cfl, TX_SIZE tx_size) { @@ -197,7 +203,7 @@ static void cfl_compute_averages(CFL_CTX *cfl, TX_SIZE tx_size) { const uint8_t *y_pix = cfl->y_down_pix; // TODO(ltrudeau) Convert to uint16 for HBD support const uint8_t *t_y_pix; - int *averages_q10 = cfl->y_averages_q10; + int *averages_q3 = cfl->y_averages_q3; cfl_load(cfl, 0, 0, width, height); @@ -212,11 +218,12 @@ static void cfl_compute_averages(CFL_CTX *cfl, TX_SIZE tx_size) { } t_y_pix += MAX_SB_SIZE; } - averages_q10[a++] = (sum << 10) >> num_pel_log2; + averages_q3[a++] = + ((sum << 3) + (1 << (num_pel_log2 - 1))) >> num_pel_log2; - // Assert no loss from fixed point - assert((double)averages_q10[a - 1] == - (sum / ((double)(1 << num_pel_log2))) * (1 << 10)); + // Loss is never more than 1/2 (in Q3) + assert(fabs((double)averages_q3[a - 1] - + (sum / ((double)(1 << num_pel_log2))) * (1 << 3)) <= 0.5); } assert(a % stride == 0); y_pix += block_row_stride; @@ -253,7 +260,7 @@ void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, // TODO(ltrudeau) Convert to uint16 to support HBD const uint8_t *y_pix = cfl->y_down_pix; - const int dc_pred_bias_q13 = (cfl->dc_pred_q7[plane - 1] << 6) + (1 << 12); + const int dc_pred_bias_q6 = cfl->dc_pred_q6[plane - 1] + 32; const double alpha = cfl_idx_to_alpha( mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs[plane - 1], plane - 1); // TODO(ltrudeau) Convert alpha to fixed point. @@ -263,23 +270,23 @@ void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, (row << tx_size_wide_log2[0]) >> tx_size_wide_log2[tx_size]; const int avg_col = (col << tx_size_high_log2[0]) >> tx_size_high_log2[tx_size]; - const int avg_q10 = - cfl->y_averages_q10[cfl->y_averages_stride * avg_row + avg_col]; + const int avg_q3 = + cfl->y_averages_q3[cfl->y_averages_stride * avg_row + avg_col]; cfl_load(cfl, row, col, width, height); for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { - const int pred_q13 = - get_scaled_luma_q13(alpha_q3, y_pix[i], avg_q10) + dc_pred_bias_q13; + const int pred_q6 = + get_scaled_luma_q6(alpha_q3, y_pix[i], avg_q3) + dc_pred_bias_q6; // TODO(ltrudeau) Manage HBD. - if (pred_q13 <= 0) { + if (pred_q6 <= 0) { dst[i] = 0; - } else if (pred_q13 > (255 << 13)) { + } else if (pred_q6 > (255 << 6)) { dst[i] = 255; } else { - dst[i] = (uint8_t)(pred_q13 >> 13); - assert(dst[i] == (int)(alpha * (y_pix[i] - (avg_q10 / 1024.0)) + - (cfl->dc_pred_q7[plane - 1] / 128.0) + 0.5)); + dst[i] = (uint8_t)(pred_q6 >> 6); + assert(dst[i] == (int)(alpha * (y_pix[i] - (avg_q3 / 8.0)) + + (cfl->dc_pred_q6[plane - 1] / 64.0) + 0.5)); } } dst += dst_stride; diff --git a/av1/common/cfl.h b/av1/common/cfl.h index 7470eb8ad1c6dcc4bb0ff21f62dbea28519f66af..ec378874c33eb961f3cb54056a0b101e69c73b31 100644 --- a/av1/common/cfl.h +++ b/av1/common/cfl.h @@ -43,9 +43,12 @@ typedef struct { // Transform level averages of the luma reconstructed values over the entire // prediction unit - // Fixed point y_averages is Q12.10: + // Fixed point y_averages is Q12.3: // * Worst case division is 1/1024 - int y_averages_q10[MAX_NUM_TXB]; + // * Max error will be 1/16th. + // Note: 3 is chosen so that y_averages fits in 15 bits when 12 bit input is + // used + int y_averages_q3[MAX_NUM_TXB]; int y_averages_stride; int are_parameters_computed; @@ -54,9 +57,11 @@ typedef struct { int subsampling_x, subsampling_y; // Block level DC_PRED for each chromatic plane - // Fixed point dc_pred is Q12.7: + // Fixed point dc_pred is Q12.6 // * Worst case division is 1/128 - int dc_pred_q7[CFL_PRED_PLANES]; + // * Max error is 1/128th + // Note: 6 is chosen because alpha_q3 * y_average_q3 implies Q6 + int dc_pred_q6[CFL_PRED_PLANES]; // The rate associated with each alpha codeword int costs[CFL_ALPHABET_SIZE]; @@ -75,8 +80,8 @@ static const int cfl_alpha_codes[CFL_ALPHABET_SIZE][CFL_PRED_PLANES] = { { 0, 3 }, { 5, 1 }, { 1, 5 }, { 0, 5 } }; -static INLINE int get_scaled_luma_q13(int alpha_q3, int y_pix, int avg_q10) { - return alpha_q3 * ((y_pix << 10) - avg_q10); +static INLINE int get_scaled_luma_q6(int alpha_q3, int y_pix, int avg_q3) { + return alpha_q3 * ((y_pix << 3) - avg_q3); } void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm); diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c index 28461416f5ccb2e9d4fb0734e5502cdea717b98f..710cdc812a947f844f54c4a07fda692ac814c7c0 100644 --- a/av1/encoder/encodemb.c +++ b/av1/encoder/encodemb.c @@ -1429,15 +1429,15 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, #if CONFIG_CFL static int cfl_alpha_dist(const uint8_t *y_pix, int y_stride, - const int y_averages_q10[MAX_NUM_TXB], + const int y_averages_q3[MAX_NUM_TXB], const uint8_t *src, int src_stride, int width, - int height, TX_SIZE tx_size, int dc_pred_q7, + int height, TX_SIZE tx_size, int dc_pred_q6, double alpha, int *dist_neg_out) { int dist = 0; int diff; if (alpha == 0.0) { - const int dc_pred_bias = (dc_pred_q7 + 64) >> 7; + const int dc_pred_bias = (dc_pred_q6 + 32) >> 6; for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { diff = src[i] - dc_pred_bias; @@ -1451,7 +1451,7 @@ static int cfl_alpha_dist(const uint8_t *y_pix, int y_stride, return dist; } - const int dc_pred_bias_q13 = (dc_pred_q7 << 6) + (1 << 12); + const int dc_pred_bias_q6 = dc_pred_q6 + 32; // TODO(ltrudeau) Convert alpha to fixed point const int alpha_q3 = (int)(alpha * 8); int dist_neg = 0; @@ -1466,26 +1466,24 @@ static int cfl_alpha_dist(const uint8_t *y_pix, int y_stride, const int h = b_j + tx_height; for (int b_i = 0; b_i < width; b_i += tx_width) { const int w = b_i + tx_width; - const int tx_avg_q10 = y_averages_q10[a++]; + const int tx_avg_q3 = y_averages_q3[a++]; t_y_pix = y_pix; t_src = src; for (int t_j = b_j; t_j < h; t_j++) { for (int t_i = b_i; t_i < w; t_i++) { - const int scaled_luma_q13 = - get_scaled_luma_q13(alpha_q3, t_y_pix[t_i], tx_avg_q10); - const int uv = t_src[t_i]; + const int scaled_luma_q6 = + get_scaled_luma_q6(alpha_q3, t_y_pix[t_i], tx_avg_q3); + // TODO(ltrudeau) add support for HBD. - diff = - uv - - (clamp(scaled_luma_q13 + dc_pred_bias_q13, 0, (255 << 13)) >> 13); + diff = uv - + (clamp(scaled_luma_q6 + dc_pred_bias_q6, 0, (255 << 6)) >> 6); dist += diff * diff; // TODO(ltrudeau) add support for HBD. diff = uv - - (clamp(-scaled_luma_q13 + dc_pred_bias_q13, 0, (255 << 13)) >> - 13); + (clamp(-scaled_luma_q6 + dc_pred_bias_q6, 0, (255 << 6)) >> 6); dist_neg += diff * diff; } t_y_pix += y_stride; @@ -1535,9 +1533,9 @@ static void cfl_compute_alpha_ind(MACROBLOCK *const x, FRAME_CONTEXT *ec_ctx, cfl_compute_parameters(xd, tx_size); const int width = cfl->uv_width; const int height = cfl->uv_height; - const int dc_pred_u_q7 = cfl->dc_pred_q7[CFL_PRED_U]; - const int dc_pred_v_q7 = cfl->dc_pred_q7[CFL_PRED_V]; - const int *y_averages_q10 = cfl->y_averages_q10; + const int dc_pred_u_q6 = cfl->dc_pred_q6[CFL_PRED_U]; + const int dc_pred_v_q6 = cfl->dc_pred_q6[CFL_PRED_V]; + const int *y_averages_q3 = cfl->y_averages_q3; const uint8_t *y_pix = cfl->y_down_pix; CFL_SIGN_TYPE *signs = mbmi->cfl_alpha_signs; @@ -1546,20 +1544,20 @@ static void cfl_compute_alpha_ind(MACROBLOCK *const x, FRAME_CONTEXT *ec_ctx, int sse[CFL_PRED_PLANES][CFL_MAGS_SIZE]; sse[CFL_PRED_U][0] = - cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q10, src_u, src_stride_u, - width, height, tx_size, dc_pred_u_q7, 0, NULL); + cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, + width, height, tx_size, dc_pred_u_q6, 0, NULL); sse[CFL_PRED_V][0] = - cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q10, src_v, src_stride_v, - width, height, tx_size, dc_pred_v_q7, 0, NULL); + cfl_alpha_dist(y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, + width, height, tx_size, dc_pred_v_q6, 0, NULL); for (int m = 1; m < CFL_MAGS_SIZE; m += 2) { assert(cfl_alpha_mags[m + 1] == -cfl_alpha_mags[m]); sse[CFL_PRED_U][m] = cfl_alpha_dist( - y_pix, MAX_SB_SIZE, y_averages_q10, src_u, src_stride_u, width, height, - tx_size, dc_pred_u_q7, cfl_alpha_mags[m], &sse[CFL_PRED_U][m + 1]); + y_pix, MAX_SB_SIZE, y_averages_q3, src_u, src_stride_u, width, height, + tx_size, dc_pred_u_q6, cfl_alpha_mags[m], &sse[CFL_PRED_U][m + 1]); sse[CFL_PRED_V][m] = cfl_alpha_dist( - y_pix, MAX_SB_SIZE, y_averages_q10, src_v, src_stride_v, width, height, - tx_size, dc_pred_v_q7, cfl_alpha_mags[m], &sse[CFL_PRED_V][m + 1]); + y_pix, MAX_SB_SIZE, y_averages_q3, src_v, src_stride_v, width, height, + tx_size, dc_pred_v_q6, cfl_alpha_mags[m], &sse[CFL_PRED_V][m + 1]); } int dist;