Commit 467205ac authored by Luc Trudeau
Browse files

[CFL] Cache DC_PRED during CfL-RDO

By default, the DC_PRED is not cached (this includes
decoding). During cfl_rd_pick_alpha(), DC_PRED caching
is enabled: the DC_PRED is cached the first time it
is computed (for each plane) and is then reused when
testing all the other scaling parameters.

Change-Id: Ie8ba0bb0427c4d9be8de5b44e6330e8a78b9c7d9
parent 9cea993b
......@@ -537,14 +537,22 @@ typedef struct {
(CFL_SUB8X8_VAL_MI_SIZE * CFL_SUB8X8_VAL_MI_SIZE)
#endif // CONFIG_DEBUG
#define CFL_MAX_BLOCK_SIZE (BLOCK_32X32)
#define CFL_PRED_BUF_LINE (32)
#define CFL_PRED_BUF_SQUARE (CFL_PRED_BUF_LINE * CFL_PRED_BUF_LINE)
#define CFL_BUF_LINE (32)
#define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE)
typedef struct cfl_ctx {
// The CfL prediction buffer is used in two steps:
// 1. Stores Q3 reconstructed luma pixels
// (only Q2 is required, but Q3 is used to avoid shifts)
// 2. Stores Q3 AC contributions (step1 - tx block avg)
int16_t pred_buf_q3[CFL_PRED_BUF_SQUARE];
int16_t pred_buf_q3[CFL_BUF_SQUARE];
// Cache the DC_PRED when performing RDO, so it does not have to be recomputed
// for every scaling parameter
int dc_pred_is_cached[CFL_PRED_PLANES];
// The DC_PRED cache is disabled when decoding
int use_dc_pred_cache;
// Only cache the first row of the DC_PRED
int16_t dc_pred_cache[CFL_PRED_PLANES][CFL_BUF_LINE];
// Height and width currently used in the CfL prediction buffer.
int buf_height, buf_width;
......
......@@ -14,8 +14,8 @@
#include "av1/common/onyxc_int.h"
void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) {
assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_PRED_BUF_LINE);
assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_PRED_BUF_LINE);
assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
if ((cm->subsampling_x != 0 && cm->subsampling_x != 1) ||
(cm->subsampling_y != 0 && cm->subsampling_y != 1)) {
aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
......@@ -29,6 +29,11 @@ void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) {
cfl->subsampling_y = cm->subsampling_y;
cfl->are_parameters_computed = 0;
cfl->store_y = 0;
// The DC_PRED cache is disabled by default and is only enabled in
// cfl_rd_pick_alpha
cfl->use_dc_pred_cache = 0;
cfl->dc_pred_is_cached[CFL_PRED_U] = 0;
cfl->dc_pred_is_cached[CFL_PRED_V] = 0;
#if CONFIG_DEBUG
cfl_clear_sub8x8_val(cfl);
cfl->store_counter = 0;
......@@ -36,6 +41,56 @@ void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) {
#endif // CONFIG_DEBUG
}
// Saves the first row of a chroma DC_PRED into the per-plane cache held in
// xd->cfl.dc_pred_cache.
//
// xd:         block context whose CfL cache is written.
// input:      first row of the prediction. On the high bit depth path it is
//             converted with CONVERT_TO_SHORTPTR() before being copied.
// pred_plane: cache slot to fill; asserted to be below CFL_PRED_PLANES.
// width:      number of pixels to copy; asserted to be at most CFL_BUF_LINE.
void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
                       CFL_PRED_TYPE pred_plane, int width) {
  assert(pred_plane < CFL_PRED_PLANES);
  assert(width <= CFL_BUF_LINE);

  int16_t *const cache_row = xd->cfl.dc_pred_cache[pred_plane];
#if CONFIG_HIGHBITDEPTH
  if (get_bitdepth_data_path_index(xd)) {
    // Two bytes per pixel on the high bit depth path.
    memcpy(cache_row, CONVERT_TO_SHORTPTR(input), width << 1);
    return;
  }
#endif  // CONFIG_HIGHBITDEPTH
  // One byte per pixel on the low bit depth path.
  memcpy(cache_row, input, width);
}
// Fills a width x height low bit depth block with the cached DC_PRED row.
// The first `width` bytes of dc_pred_cache are copied to the start of each of
// the `height` destination rows; consecutive rows are dst_stride bytes apart.
static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst,
                                 int dst_stride, int width, int height) {
  uint8_t *row = dst;
  for (int rows_left = height; rows_left > 0; rows_left--) {
    memcpy(row, dc_pred_cache, width);
    row += dst_stride;
  }
}
// Fills a width x height high bit depth block with the cached DC_PRED row.
// The first `width` 16-bit entries of dc_pred_cache are copied to the start of
// each of the `height` destination rows; consecutive rows are dst_stride
// uint16_t elements apart.
static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst,
                                 int dst_stride, int width, int height) {
  uint16_t *row = dst;
  for (int rows_left = height; rows_left > 0; rows_left--) {
    // Two bytes per pixel.
    memcpy(row, dc_pred_cache, (size_t)(width << 1));
    row += dst_stride;
  }
}
// Writes the cached DC_PRED row of `pred_plane` into every row of the
// destination block. The block dimensions come from tx_size_wide[] and
// tx_size_high[] and are asserted to fit within CFL_BUF_LINE.
//
// xd:         block context that holds the CfL DC_PRED cache.
// dst:        destination block. On the high bit depth path it is converted
//             with CONVERT_TO_SHORTPTR() before being written.
// dst_stride: distance between consecutive destination rows.
// tx_size:    transform size selecting the block width and height.
// pred_plane: cache slot to read; asserted to be below CFL_PRED_PLANES.
void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
                      TX_SIZE tx_size, CFL_PRED_TYPE pred_plane) {
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  assert(pred_plane < CFL_PRED_PLANES);
  assert(width <= CFL_BUF_LINE);
  assert(height <= CFL_BUF_LINE);

  const int16_t *const cached_row = xd->cfl.dc_pred_cache[pred_plane];
#if CONFIG_HIGHBITDEPTH
  if (get_bitdepth_data_path_index(xd)) {
    cfl_load_dc_pred_hbd(cached_row, CONVERT_TO_SHORTPTR(dst), dst_stride,
                         width, height);
    return;
  }
#endif  // CONFIG_HIGHBITDEPTH
  cfl_load_dc_pred_lbd(cached_row, dst, dst_stride, width, height);
}
// Due to frame boundary issues, it is possible that the total area covered by
// chroma exceeds that of luma. When this happens, we fill the missing pixels by
// repeating the last columns and/or rows.
......@@ -48,25 +103,24 @@ static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) {
int16_t *pred_buf_q3 = cfl->pred_buf_q3 + (width - diff_width);
for (int j = 0; j < min_height; j++) {
const int16_t last_pixel = pred_buf_q3[-1];
assert(pred_buf_q3 + diff_width <=
cfl->pred_buf_q3 + CFL_PRED_BUF_SQUARE);
assert(pred_buf_q3 + diff_width <= cfl->pred_buf_q3 + CFL_BUF_SQUARE);
for (int i = 0; i < diff_width; i++) {
pred_buf_q3[i] = last_pixel;
}
pred_buf_q3 += CFL_PRED_BUF_LINE;
pred_buf_q3 += CFL_BUF_LINE;
}
cfl->buf_width = width;
}
if (diff_height > 0) {
int16_t *pred_buf_q3 =
cfl->pred_buf_q3 + ((height - diff_height) * CFL_PRED_BUF_LINE);
cfl->pred_buf_q3 + ((height - diff_height) * CFL_BUF_LINE);
for (int j = 0; j < diff_height; j++) {
const int16_t *last_row_q3 = pred_buf_q3 - CFL_PRED_BUF_LINE;
assert(pred_buf_q3 + width <= cfl->pred_buf_q3 + CFL_PRED_BUF_SQUARE);
const int16_t *last_row_q3 = pred_buf_q3 - CFL_BUF_LINE;
assert(pred_buf_q3 + width <= cfl->pred_buf_q3 + CFL_BUF_SQUARE);
for (int i = 0; i < width; i++) {
pred_buf_q3[i] = last_row_q3[i];
}
pred_buf_q3 += CFL_PRED_BUF_LINE;
pred_buf_q3 += CFL_BUF_LINE;
}
cfl->buf_height = height;
}
......@@ -84,11 +138,11 @@ static void cfl_subtract_average(CFL_CTX *cfl, TX_SIZE tx_size) {
cfl_pad(cfl, tx_width, tx_height);
for (int j = 0; j < tx_height; j++) {
assert(pred_buf_q3 + tx_width <= cfl->pred_buf_q3 + CFL_PRED_BUF_SQUARE);
assert(pred_buf_q3 + tx_width <= cfl->pred_buf_q3 + CFL_BUF_SQUARE);
for (int i = 0; i < tx_width; i++) {
sum_q3 += pred_buf_q3[i];
}
pred_buf_q3 += CFL_PRED_BUF_LINE;
pred_buf_q3 += CFL_BUF_LINE;
}
const int avg_q3 = (sum_q3 + (1 << (num_pel_log2 - 1))) >> num_pel_log2;
// Loss is never more than 1/2 (in Q3)
......@@ -96,11 +150,11 @@ static void cfl_subtract_average(CFL_CTX *cfl, TX_SIZE tx_size) {
1);
pred_buf_q3 = cfl->pred_buf_q3;
for (int j = 0; j < tx_height; j++) {
assert(pred_buf_q3 + tx_width <= cfl->pred_buf_q3 + CFL_PRED_BUF_SQUARE);
assert(pred_buf_q3 + tx_width <= cfl->pred_buf_q3 + CFL_BUF_SQUARE);
for (int i = 0; i < tx_width; i++) {
pred_buf_q3[i] -= avg_q3;
}
pred_buf_q3 += CFL_PRED_BUF_LINE;
pred_buf_q3 += CFL_BUF_LINE;
}
}
......@@ -117,14 +171,14 @@ static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign,
// Adds the scaled CfL contribution to a low bit depth prediction in place:
// every dst pixel becomes
//   clip_pixel(get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dst[i]).
// dst rows are dst_stride apart.
// NOTE(review): this listing carries both the CFL_PRED_BUF_* lines and their
// CFL_BUF_* counterparts (assert and row advance) — presumably the removed and
// added sides of the rename; confirm which pair is meant to remain.
static void cfl_build_prediction_lbd(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, int width, int height,
int alpha_q3) {
assert((height - 1) * CFL_PRED_BUF_LINE + width <= CFL_PRED_BUF_SQUARE);
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
dst[i] =
clip_pixel(get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dst[i]);
}
dst += dst_stride;
pred_buf_q3 += CFL_PRED_BUF_LINE;
pred_buf_q3 += CFL_BUF_LINE;
}
}
......@@ -132,14 +186,14 @@ static void cfl_build_prediction_lbd(const int16_t *pred_buf_q3, uint8_t *dst,
// High bit depth counterpart of the CfL prediction build: every dst pixel
// becomes clip_pixel_highbd(get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) +
// dst[i], bit_depth). dst rows are dst_stride elements apart.
// NOTE(review): this listing carries both the CFL_PRED_BUF_* lines and their
// CFL_BUF_* counterparts (assert and row advance) — presumably the removed and
// added sides of the rename; confirm which pair is meant to remain.
static void cfl_build_prediction_hbd(const int16_t *pred_buf_q3, uint16_t *dst,
int dst_stride, int width, int height,
int alpha_q3, int bit_depth) {
assert((height - 1) * CFL_PRED_BUF_LINE + width <= CFL_PRED_BUF_SQUARE);
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
dst[i] = clip_pixel_highbd(
get_scaled_luma_q0(alpha_q3, pred_buf_q3[i]) + dst[i], bit_depth);
}
dst += dst_stride;
pred_buf_q3 += CFL_PRED_BUF_LINE;
pred_buf_q3 += CFL_BUF_LINE;
}
}
#endif // CONFIG_HIGHBITDEPTH
......@@ -202,7 +256,7 @@ void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
static void cfl_luma_subsampling_420_lbd(const uint8_t *input, int input_stride,
int16_t *output_q3, int width,
int height) {
assert((height - 1) * CFL_PRED_BUF_LINE + width <= CFL_PRED_BUF_SQUARE);
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
int top = i << 1;
......@@ -211,47 +265,47 @@ static void cfl_luma_subsampling_420_lbd(const uint8_t *input, int input_stride,
<< 1;
}
input += input_stride << 1;
output_q3 += CFL_PRED_BUF_LINE;
output_q3 += CFL_BUF_LINE;
}
}
// Horizontal 2:1 luma subsampling for low bit depth input: each output is the
// sum of two horizontally adjacent input samples shifted left by 2 (8x their
// average). One input row is consumed per output row.
// NOTE(review): this listing carries both the CFL_PRED_BUF_* lines and their
// CFL_BUF_* counterparts (assert and row advance) — presumably the removed and
// added sides of the rename; confirm which pair is meant to remain.
static void cfl_luma_subsampling_422_lbd(const uint8_t *input, int input_stride,
int16_t *output_q3, int width,
int height) {
assert((height - 1) * CFL_PRED_BUF_LINE + width <= CFL_PRED_BUF_SQUARE);
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
int left = i << 1;
output_q3[i] = (input[left] + input[left + 1]) << 2;
}
input += input_stride;
output_q3 += CFL_PRED_BUF_LINE;
output_q3 += CFL_BUF_LINE;
}
}
// Vertical 2:1 luma subsampling for low bit depth input: each output is the
// sum of an input sample and the one input_stride below it, shifted left by 2
// (8x their average). Two input rows are consumed per output row.
// NOTE(review): this listing carries both the CFL_PRED_BUF_* lines and their
// CFL_BUF_* counterparts (assert and row advance) — presumably the removed and
// added sides of the rename; confirm which pair is meant to remain.
static void cfl_luma_subsampling_440_lbd(const uint8_t *input, int input_stride,
int16_t *output_q3, int width,
int height) {
assert((height - 1) * CFL_PRED_BUF_LINE + width <= CFL_PRED_BUF_SQUARE);
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
output_q3[i] = (input[i] + input[i + input_stride]) << 2;
}
input += input_stride << 1;
output_q3 += CFL_PRED_BUF_LINE;
output_q3 += CFL_BUF_LINE;
}
}
// No-subsampling luma copy for low bit depth input: each output is the
// co-located input sample shifted left by 3. One input row is consumed per
// output row.
// NOTE(review): this listing carries both the CFL_PRED_BUF_* lines and their
// CFL_BUF_* counterparts (assert and row advance) — presumably the removed and
// added sides of the rename; confirm which pair is meant to remain.
static void cfl_luma_subsampling_444_lbd(const uint8_t *input, int input_stride,
int16_t *output_q3, int width,
int height) {
assert((height - 1) * CFL_PRED_BUF_LINE + width <= CFL_PRED_BUF_SQUARE);
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
output_q3[i] = input[i] << 3;
}
input += input_stride;
output_q3 += CFL_PRED_BUF_LINE;
output_q3 += CFL_BUF_LINE;
}
}
......@@ -269,7 +323,7 @@ static const cfl_subsample_lbd_fn subsample_lbd[2][2] = {
static void cfl_luma_subsampling_420_hbd(const uint16_t *input,
int input_stride, int16_t *output_q3,
int width, int height) {
assert((height - 1) * CFL_PRED_BUF_LINE + width <= CFL_PRED_BUF_SQUARE);
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
int top = i << 1;
......@@ -278,47 +332,47 @@ static void cfl_luma_subsampling_420_hbd(const uint16_t *input,
<< 1;
}
input += input_stride << 1;
output_q3 += CFL_PRED_BUF_LINE;
output_q3 += CFL_BUF_LINE;
}
}
// Horizontal 2:1 luma subsampling for high bit depth input: each output is the
// sum of two horizontally adjacent input samples shifted left by 2 (8x their
// average). One input row is consumed per output row.
// NOTE(review): this listing carries both the CFL_PRED_BUF_* lines and their
// CFL_BUF_* counterparts (assert and row advance) — presumably the removed and
// added sides of the rename; confirm which pair is meant to remain.
static void cfl_luma_subsampling_422_hbd(const uint16_t *input,
int input_stride, int16_t *output_q3,
int width, int height) {
assert((height - 1) * CFL_PRED_BUF_LINE + width <= CFL_PRED_BUF_SQUARE);
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
int left = i << 1;
output_q3[i] = (input[left] + input[left + 1]) << 2;
}
input += input_stride;
output_q3 += CFL_PRED_BUF_LINE;
output_q3 += CFL_BUF_LINE;
}
}
// Vertical 2:1 luma subsampling for high bit depth input: each output is the
// sum of an input sample and the one input_stride below it, shifted left by 2
// (8x their average). Two input rows are consumed per output row.
// NOTE(review): this listing carries both the CFL_PRED_BUF_* lines and their
// CFL_BUF_* counterparts (assert and row advance) — presumably the removed and
// added sides of the rename; confirm which pair is meant to remain.
static void cfl_luma_subsampling_440_hbd(const uint16_t *input,
int input_stride, int16_t *output_q3,
int width, int height) {
assert((height - 1) * CFL_PRED_BUF_LINE + width <= CFL_PRED_BUF_SQUARE);
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
output_q3[i] = (input[i] + input[i + input_stride]) << 2;
}
input += input_stride << 1;
output_q3 += CFL_PRED_BUF_LINE;
output_q3 += CFL_BUF_LINE;
}
}
// No-subsampling luma copy for high bit depth input: each output is the
// co-located input sample shifted left by 3. One input row is consumed per
// output row.
// NOTE(review): this listing carries both the CFL_PRED_BUF_* lines and their
// CFL_BUF_* counterparts (assert and row advance) — presumably the removed and
// added sides of the rename; confirm which pair is meant to remain.
static void cfl_luma_subsampling_444_hbd(const uint16_t *input,
int input_stride, int16_t *output_q3,
int width, int height) {
assert((height - 1) * CFL_PRED_BUF_LINE + width <= CFL_PRED_BUF_SQUARE);
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
output_q3[i] = input[i] << 3;
}
input += input_stride;
output_q3 += CFL_PRED_BUF_LINE;
output_q3 += CFL_BUF_LINE;
}
}
......@@ -358,12 +412,12 @@ static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride,
}
// Check that we will remain inside the pixel buffer.
assert(store_row + store_height <= CFL_PRED_BUF_LINE);
assert(store_col + store_width <= CFL_PRED_BUF_LINE);
assert(store_row + store_height <= CFL_BUF_LINE);
assert(store_col + store_width <= CFL_BUF_LINE);
// Store the input into the CfL pixel buffer
int16_t *pred_buf_q3 =
cfl->pred_buf_q3 + (store_row * CFL_PRED_BUF_LINE + store_col);
cfl->pred_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
#if CONFIG_HIGHBITDEPTH
if (use_hbd) {
......
......@@ -25,6 +25,11 @@ static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) {
return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6);
}
// Maps a chroma plane index to its CfL prediction slot by subtracting one.
// The plane must be greater than zero (asserted).
static INLINE CFL_PRED_TYPE get_cfl_pred_type(PLANE_TYPE plane) {
  assert(plane > 0);
  const int pred_index = plane - 1;
  return (CFL_PRED_TYPE)pred_index;
}
void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
TX_SIZE tx_size, int plane);
......@@ -32,4 +37,10 @@ void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
BLOCK_SIZE bsize);
void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
CFL_PRED_TYPE pred_plane, int width);
void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
TX_SIZE tx_size, CFL_PRED_TYPE pred_plane);
#endif // AV1_COMMON_CFL_H_
......@@ -2762,10 +2762,6 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
const PREDICTION_MODE mode =
(plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
av1_predict_intra_block(cm, xd, pd->width, pd->height,
txsize_to_bsize[tx_size], mode, dst, dst_stride, dst,
dst_stride, blk_col, blk_row, plane);
#if CONFIG_CFL
if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {
#if CONFIG_DEBUG
......@@ -2777,9 +2773,26 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
#endif
CFL_CTX *const cfl = &xd->cfl;
CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane);
if (cfl->dc_pred_is_cached[pred_plane] == 0) {
av1_predict_intra_block(cm, xd, pd->width, pd->height,
txsize_to_bsize[tx_size], mode, dst, dst_stride,
dst, dst_stride, blk_col, blk_row, plane);
if (cfl->use_dc_pred_cache) {
cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]);
cfl->dc_pred_is_cached[pred_plane] = 1;
}
} else {
cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane);
}
cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
return;
}
#endif
av1_predict_intra_block(cm, xd, pd->width, pd->height,
txsize_to_bsize[tx_size], mode, dst, dst_stride, dst,
dst_stride, blk_col, blk_row, plane);
}
// Copy the given row of dst into the equivalent row of ref, saving
......
......@@ -5418,6 +5418,7 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
#endif
xd->cfl.use_dc_pred_cache = 1;
const int64_t mode_rd =
RDCOST(x->rdmult, x->intra_uv_mode_cost[mbmi->mode][UV_CFL_PRED], 0);
int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
......@@ -5508,6 +5509,9 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
mbmi->cfl_alpha_idx = ind;
mbmi->cfl_alpha_signs = best_joint_sign;
xd->cfl.use_dc_pred_cache = 0;
xd->cfl.dc_pred_is_cached[0] = 0;
xd->cfl.dc_pred_is_cached[1] = 0;
return best_rate_overhead;
}
#endif // CONFIG_CFL
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment