/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>

#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"

#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_scan.h"

#include "vp9/encoder/vp9_cost.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"

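// Scratch buffer descriptor used below to hold candidate inter predictors
// while several modes are evaluated, so the best prediction can be reused
// instead of being rebuilt.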
typedef struct {
  uint8_t *data;
  int stride;
  int in_use;
} PRED_BUFFER;

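// Trimmed-down motion vector reference search for the real-time (non-RD)
// path: scans spatial neighbors only, with early exits once candidates are
// found, and returns a const_motion flag for the caller.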
static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                      const TileInfo *const tile,
                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                      int_mv *mv_ref_list,
                      int mi_row, int mi_col) {
  const int *ref_sign_bias = cm->ref_frame_sign_bias;
  int i, refmv_count = 0;

  const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];

  int different_ref_found = 0;
  int context_counter = 0;
  int const_motion = 0;

  // Blank the reference vector list
  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);

  // The nearest 2 blocks are treated differently
  // if the size < 8x8 we get the mv from the bmi substructure,
  // and we also need to keep a mode count.
  for (i = 0; i < 2; ++i) {
    const POSITION *const mv_ref = &mv_ref_search[i];
    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
      const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
                                                   xd->mi_stride];
      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
      // Keep counts for entropy encoding.
      context_counter += mode_2_counter[candidate->mode];
      different_ref_found = 1;

      if (candidate->ref_frame[0] == ref_frame)
        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1),
                        refmv_count, mv_ref_list, Done);
    }
  }

  const_motion = 1;

  // Check the rest of the neighbors in much the same way
  // as before except we don't need to keep track of sub blocks or
  // mode counts.
  for (; i < MVREF_NEIGHBOURS && !refmv_count; ++i) {
    const POSITION *const mv_ref = &mv_ref_search[i];
    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
      const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
                                                    xd->mi_stride]->mbmi;
      different_ref_found = 1;

      if (candidate->ref_frame[0] == ref_frame)
        ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, Done);
    }
  }

  // Since we couldn't find 2 mvs from the same reference frame
  // go back through the neighbors and find motion vectors from
  // different reference frames.
  if (different_ref_found && !refmv_count) {
    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
      const POSITION *mv_ref = &mv_ref_search[i];
      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
        const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
                                              * xd->mi_stride]->mbmi;

        // If the candidate is INTRA we don't want to consider its mv.
        IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
                                 refmv_count, mv_ref_list, Done);
      }
    }
  }

 Done:

  mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter];

  // Clamp vectors
  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
    clamp_mv_ref(&mv_ref_list[i].as_mv, xd);

  return const_motion;
}

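// Full-pel motion search for NEWMV, refined to sub-pel precision only if the
// estimated cost of the full-pel result (mv bits plus mode bits) can still
// beat best_rd_sofar; returns that go/no-go decision.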
static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
                                  int_mv *tmp_mv, int *rate_mv,
                                  int64_t best_rd_sofar) {
  MACROBLOCKD *xd = &x->e_mbd;
  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
  const int step_param = cpi->sf.mv.fullpel_search_step_param;
  const int sadpb = x->sadperbit16;
  MV mvp_full;
  const int ref = mbmi->ref_frame[0];
  const MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
  int dis;
  int rate_mode;
  const int tmp_col_min = x->mv_col_min;
  const int tmp_col_max = x->mv_col_max;
  const int tmp_row_min = x->mv_row_min;
  const int tmp_row_max = x->mv_row_max;
  int rv = 0;
  int cost_list[5];
  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                        ref);
  if (scaled_ref_frame) {
    int i;
    // Swap out the reference frame for a version that's been scaled to
    // match the resolution of the current frame, allowing the existing
    // motion search code to be used without additional modifications.
    for (i = 0; i < MAX_MB_PLANE; i++)
      backup_yv12[i] = xd->plane[i].pre[0];
    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
  }
  vp9_set_mv_search_range(x, &ref_mv);

  assert(x->mv_best_ref_index[ref] <= 2);
  if (x->mv_best_ref_index[ref] < 2)
    mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
  else
    mvp_full = x->pred_mv[ref];

  mvp_full.col >>= 3;
  mvp_full.row >>= 3;

  vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
                        cond_cost_list(cpi, cost_list),
                        &ref_mv, &tmp_mv->as_mv, INT_MAX, 0);

  x->mv_col_min = tmp_col_min;
  x->mv_col_max = tmp_col_max;
  x->mv_row_min = tmp_row_min;
  x->mv_row_max = tmp_row_max;

  // Calculate the bit cost of the motion vector.
  mvp_full.row = tmp_mv->as_mv.row * 8;
  mvp_full.col = tmp_mv->as_mv.col * 8;

  *rate_mv = vp9_mv_bit_cost(&mvp_full, &ref_mv,
                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);

  rate_mode = cpi->inter_mode_cost[mbmi->mode_context[ref]]
                                  [INTER_OFFSET(NEWMV)];
  rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) >
         best_rd_sofar);

  if (rv) {
    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
                                 cpi->common.allow_high_precision_mv,
                                 x->errorperbit,
                                 &cpi->fn_ptr[bsize],
                                 cpi->sf.mv.subpel_force_stop,
                                 cpi->sf.mv.subpel_iters_per_step,
                                 cond_cost_list(cpi, cost_list),
                                 x->nmvjointcost, x->mvcost,
                                 &dis, &x->pred_sse[ref], NULL, 0, 0);
    *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
  }

  if (scaled_ref_frame) {
    int i;
    for (i = 0; i < MAX_MB_PLANE; i++)
      xd->plane[i].pre[0] = backup_yv12[i];
  }
  return rv;
}

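// Computes SSE, sum and variance for each 8x8 sub-block of the w x h region
// while accumulating the totals for the whole block; the >> 6 divides
// sum * sum by the 64 pixels of an 8x8 block.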
static void block_variance(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           int w, int h, unsigned int *sse, int *sum,
                           int block_size, unsigned int *sse8x8,
                           int *sum8x8, unsigned int *var8x8) {
  int i, j, k = 0;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      vpx_get8x8var(src + src_stride * i + j, src_stride,
                    ref + ref_stride * i + j, ref_stride,
                    &sse8x8[k], &sum8x8[k]);
      *sse += sse8x8[k];
      *sum += sum8x8[k];
      var8x8[k] = sse8x8[k] - (((unsigned int)sum8x8[k] * sum8x8[k]) >> 6);
      k++;
    }
  }
}

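// Merges each 2x2 group of sub-block SSE/sum statistics into statistics for
// the next transform size up (e.g. 8x8 -> 16x16, 16x16 -> 32x32).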
static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
                               unsigned int *sse_i, int *sum_i,
                               unsigned int *var_o, unsigned int *sse_o,
                               int *sum_o) {
  const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
  const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
  const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
  int i, j, k = 0;

  for (i = 0; i < nh; i += 2) {
    for (j = 0; j < nw; j += 2) {
      sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] +
          sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
      sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
          sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
      var_o[k] = sse_o[k] - (((unsigned int)sum_o[k] * sum_o[k]) >>
          (b_width_log2_lookup[unit_size] +
              b_height_log2_lookup[unit_size] + 6));
      k++;
    }
  }
}

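// Variance-based rate/distortion model for the Y plane of large partitions.
// Besides modeling rate and distortion, it tests per transform block whether
// all AC/DC coefficients would quantize to zero, and sets *early_term when
// the UV planes turn out to be skippable as well.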
static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                    MACROBLOCK *x, MACROBLOCKD *xd,
                                    int *out_rate_sum, int64_t *out_dist_sum,
                                    unsigned int *var_y, unsigned int *sse_y,
                                    int mi_row, int mi_col, int *early_term) {
  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence quantizer step is also 8 times. To get effective quantizer
  // we need to divide by 8 before sending to modeling function.
  unsigned int sse;
  int rate;
  int64_t dist;
  struct macroblock_plane *const p = &x->plane[0];
  struct macroblockd_plane *const pd = &xd->plane[0];
  const uint32_t dc_quant = pd->dequant[0];
  const uint32_t ac_quant = pd->dequant[1];
  const int64_t dc_thr = dc_quant * dc_quant >> 6;
  const int64_t ac_thr = ac_quant * ac_quant >> 6;
  unsigned int var;
  int sum;
  int skip_dc = 0;

  const int bw = b_width_log2_lookup[bsize];
  const int bh = b_height_log2_lookup[bsize];
  const int num8x8 = 1 << (bw + bh - 2);
  unsigned int sse8x8[64] = {0};
  int sum8x8[64] = {0};
  unsigned int var8x8[64] = {0};
  TX_SIZE tx_size;
  int i, k;

  // Calculate variance for whole partition, and also save 8x8 blocks' variance
  // to be used in following transform skipping test.
  block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
                 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
  var = sse - (((int64_t)sum * sum) >> (bw + bh + 4));

  *var_y = var;
  *sse_y = sse;

  if (cpi->common.tx_mode == TX_MODE_SELECT) {
    if (sse > (var << 2))
      tx_size = MIN(max_txsize_lookup[bsize],
                    tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
    else
      tx_size = TX_8X8;

    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
        cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id))
      tx_size = TX_8X8;
    else if (tx_size > TX_16X16)
      tx_size = TX_16X16;
  } else {
    tx_size = MIN(max_txsize_lookup[bsize],
                  tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
  }

  assert(tx_size >= TX_8X8);
  xd->mi[0]->mbmi.tx_size = tx_size;

  // Evaluate if the partition block is a skippable block in Y plane.
  {
    unsigned int sse16x16[16] = {0};
    int sum16x16[16] = {0};
    unsigned int var16x16[16] = {0};
    const int num16x16 = num8x8 >> 2;

    unsigned int sse32x32[4] = {0};
    int sum32x32[4] = {0};
    unsigned int var32x32[4] = {0};
    const int num32x32 = num8x8 >> 4;

    int ac_test = 1;
    int dc_test = 1;
    const int num = (tx_size == TX_8X8) ? num8x8 :
        ((tx_size == TX_16X16) ? num16x16 : num32x32);
    const unsigned int *sse_tx = (tx_size == TX_8X8) ? sse8x8 :
        ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
    const unsigned int *var_tx = (tx_size == TX_8X8) ? var8x8 :
        ((tx_size == TX_16X16) ? var16x16 : var32x32);

    // Calculate variance if tx_size > TX_8X8
    if (tx_size >= TX_16X16)
      calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
                         sum16x16);
    if (tx_size == TX_32X32)
      calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32,
                         sse32x32, sum32x32);

    // Skipping test
    x->skip_txfm[0] = 0;
    for (k = 0; k < num; k++)
      // Check if all ac coefficients can be quantized to zero.
      if (!(var_tx[k] < ac_thr || var == 0)) {
        ac_test = 0;
        break;
      }

    for (k = 0; k < num; k++)
      // Check if dc coefficient can be quantized to zero.
      if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
        dc_test = 0;
        break;
      }

    if (ac_test) {
      x->skip_txfm[0] = 2;

      if (dc_test)
        x->skip_txfm[0] = 1;
    } else if (dc_test) {
      skip_dc = 1;
    }
  }

  if (x->skip_txfm[0] == 1) {
    int skip_uv[2] = {0};
    unsigned int var_uv[2];
    unsigned int sse_uv[2];

    *out_rate_sum = 0;
    *out_dist_sum = sse << 4;

    // Transform skipping test in UV planes.
    for (i = 1; i <= 2; i++) {
      struct macroblock_plane *const p = &x->plane[i];
      struct macroblockd_plane *const pd = &xd->plane[i];
      const TX_SIZE uv_tx_size = get_uv_tx_size(&xd->mi[0]->mbmi, pd);
      const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
      const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
      const int uv_bw = b_width_log2_lookup[uv_bsize];
      const int uv_bh = b_height_log2_lookup[uv_bsize];
      const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
          (uv_bh - b_height_log2_lookup[unit_size]);
      const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
      const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
      int j = i - 1;

      vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
      var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p->src.buf, p->src.stride,
          pd->dst.buf, pd->dst.stride, &sse_uv[j]);

      if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
          (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
        skip_uv[j] = 1;
      else
        break;
    }

    // If the transforms in the Y and UV planes are all skippable, the mode
    // search checks fewer inter modes and doesn't check intra modes.
    if (skip_uv[0] & skip_uv[1]) {
      *early_term = 1;
    }

    return;
  }

  if (!skip_dc) {
#if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                   dc_quant >> (xd->bd - 5), &rate, &dist);
    } else {
      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                   dc_quant >> 3, &rate, &dist);
    }
#else
    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                 dc_quant >> 3, &rate, &dist);
#endif  // CONFIG_VP9_HIGHBITDEPTH
  }

  if (!skip_dc) {
    *out_rate_sum = rate >> 1;
    *out_dist_sum = dist << 3;
  } else {
    *out_rate_sum = 0;
    *out_dist_sum = (sse - var) << 4;
  }

#if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                                 ac_quant >> (xd->bd - 5), &rate, &dist);
  } else {
    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                                 ac_quant >> 3, &rate, &dist);
  }
#else
  vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                               ac_quant >> 3, &rate, &dist);
#endif  // CONFIG_VP9_HIGHBITDEPTH

  *out_rate_sum += rate;
  *out_dist_sum += dist << 4;
}

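// Same variance-based model as model_rd_for_sb_y_large, but measuring a
// single variance for the whole block and without the early-termination
// test.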
static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
                              MACROBLOCK *x, MACROBLOCKD *xd,
                              int *out_rate_sum, int64_t *out_dist_sum,
                              unsigned int *var_y, unsigned int *sse_y) {
  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence quantizer step is also 8 times. To get effective quantizer
  // we need to divide by 8 before sending to modeling function.
  unsigned int sse;
  int rate;
  int64_t dist;
  struct macroblock_plane *const p = &x->plane[0];
  struct macroblockd_plane *const pd = &xd->plane[0];
  const int64_t dc_thr = p->quant_thred[0] >> 6;
  const int64_t ac_thr = p->quant_thred[1] >> 6;
  const uint32_t dc_quant = pd->dequant[0];
  const uint32_t ac_quant = pd->dequant[1];
  unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
                                           pd->dst.buf, pd->dst.stride, &sse);
  int skip_dc = 0;

  *var_y = var;
  *sse_y = sse;

  if (cpi->common.tx_mode == TX_MODE_SELECT) {
    if (sse > (var << 2))
      xd->mi[0]->mbmi.tx_size =
          MIN(max_txsize_lookup[bsize],
              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
    else
      xd->mi[0]->mbmi.tx_size = TX_8X8;

    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
        cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id))
      xd->mi[0]->mbmi.tx_size = TX_8X8;
    else if (xd->mi[0]->mbmi.tx_size > TX_16X16)
      xd->mi[0]->mbmi.tx_size = TX_16X16;
  } else {
    xd->mi[0]->mbmi.tx_size =
        MIN(max_txsize_lookup[bsize],
            tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
  }

  // Evaluate if the partition block is a skippable block in Y plane.
  {
    const BLOCK_SIZE unit_size =
        txsize_to_bsize[xd->mi[0]->mbmi.tx_size];
    const unsigned int num_blk_log2 =
        (b_width_log2_lookup[bsize] - b_width_log2_lookup[unit_size]) +
        (b_height_log2_lookup[bsize] - b_height_log2_lookup[unit_size]);
    const unsigned int sse_tx = sse >> num_blk_log2;
    const unsigned int var_tx = var >> num_blk_log2;

    x->skip_txfm[0] = 0;
    // Check if all ac coefficients can be quantized to zero.
    if (var_tx < ac_thr || var == 0) {
      x->skip_txfm[0] = 2;
      // Check if dc coefficient can be quantized to zero.
      if (sse_tx - var_tx < dc_thr || sse == var)
        x->skip_txfm[0] = 1;
    } else {
      if (sse_tx - var_tx < dc_thr || sse == var)
        skip_dc = 1;
    }
  }

  if (x->skip_txfm[0] == 1) {
    *out_rate_sum = 0;
    *out_dist_sum = sse << 4;
    return;
  }

  if (!skip_dc) {
#if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                   dc_quant >> (xd->bd - 5), &rate, &dist);
    } else {
      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                   dc_quant >> 3, &rate, &dist);
    }
#else
    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
                                 dc_quant >> 3, &rate, &dist);
#endif  // CONFIG_VP9_HIGHBITDEPTH
  }

  if (!skip_dc) {
    *out_rate_sum = rate >> 1;
    *out_dist_sum = dist << 3;
  } else {
    *out_rate_sum = 0;
    *out_dist_sum = (sse - var) << 4;
  }

#if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                                 ac_quant >> (xd->bd - 5), &rate, &dist);
  } else {
    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                                 ac_quant >> 3, &rate, &dist);
  }
#else
  vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
                               ac_quant >> 3, &rate, &dist);
#endif  // CONFIG_VP9_HIGHBITDEPTH

  *out_rate_sum += rate;
  *out_dist_sum += dist << 4;
}

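// Estimates Y-plane rate/distortion by actually transforming and quantizing
// the residual (Hadamard for 8x8/16x16), using the SATD of the quantized
// coefficients as a rate proxy and the coefficient-domain block error as
// distortion. The high-bitdepth build falls back to the variance model above.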
#if CONFIG_VP9_HIGHBITDEPTH
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
                      int *skippable, int64_t *sse, int plane,
                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
  MACROBLOCKD *xd = &x->e_mbd;
  unsigned int var_y, sse_y;
  (void)plane;
  (void)tx_size;
  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
  *sse = INT_MAX;
  *skippable = 0;
  return;
}
#else
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
                      int *skippable, int64_t *sse, int plane,
                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
  MACROBLOCKD *xd = &x->e_mbd;
  const struct macroblockd_plane *pd = &xd->plane[plane];
  const struct macroblock_plane *const p = &x->plane[plane];
  const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
  const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
  const int step = 1 << (tx_size << 1);
  const int block_step = (1 << tx_size);
  int block = 0, r, c;
  int shift = tx_size == TX_32X32 ? 0 : 2;
  const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
  const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
  int eob_cost = 0;

  (void)cpi;
  vp9_subtract_plane(x, bsize, plane);
  *skippable = 1;
  // Keep track of the row and column of the blocks we use so that we know
  // if we are in the unrestricted motion border.
  for (r = 0; r < max_blocks_high; r += block_step) {
    for (c = 0; c < num_4x4_w; c += block_step) {
      if (c < max_blocks_wide) {
        const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
        uint16_t *const eob = &p->eobs[block];
        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
        const int16_t *src_diff;
        src_diff = &p->src_diff[(r * diff_stride + c) << 2];

        switch (tx_size) {
          case TX_32X32:
            vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
            vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
                                  p->round_fp, p->quant_fp, p->quant_shift,
                                  qcoeff, dqcoeff, pd->dequant, eob,
                                  scan_order->scan, scan_order->iscan);
            break;
          case TX_16X16:
            vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
            vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, eob,
                            scan_order->scan, scan_order->iscan);
            break;
          case TX_8X8:
            vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
            vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, eob,
                            scan_order->scan, scan_order->iscan);
            break;
          case TX_4X4:
            x->fwd_txm4x4(src_diff, coeff, diff_stride);
            vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, eob,
                            scan_order->scan, scan_order->iscan);
            break;
          default:
            assert(0);
            break;
        }
        *skippable &= (*eob == 0);
        eob_cost += 1;
      }
      block += step;
    }
  }

  if (*skippable && *sse < INT64_MAX) {
    *rate = 0;
    *dist = (*sse << 6) >> shift;
    *sse = *dist;
    return;
  }

  block = 0;
  *rate = 0;
  *dist = 0;
  if (*sse < INT64_MAX)
    *sse = (*sse << 6) >> shift;
  for (r = 0; r < max_blocks_high; r += block_step) {
    for (c = 0; c < num_4x4_w; c += block_step) {
      if (c < max_blocks_wide) {
        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
        uint16_t *const eob = &p->eobs[block];

        if (*eob == 1)
          *rate += (int)abs(qcoeff[0]);
        else if (*eob > 1)
          *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);

        *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
      }
      block += step;
    }
  }

  if (*skippable == 0) {
    *rate <<= 10;
    *rate += (eob_cost << 8);
  }
}
#endif

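// Variance-based rate/distortion model for the chroma planes; planes not
// flagged in x->color_sensitivity are skipped entirely.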
static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
                               MACROBLOCK *x, MACROBLOCKD *xd,
                               int *out_rate_sum, int64_t *out_dist_sum,
                               unsigned int *var_y, unsigned int *sse_y) {
  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence quantizer step is also 8 times. To get effective quantizer
  // we need to divide by 8 before sending to modeling function.
  unsigned int sse;
  int rate;
  int64_t dist;
  int i;

  *out_rate_sum = 0;
  *out_dist_sum = 0;

  for (i = 1; i <= 2; ++i) {
    struct macroblock_plane *const p = &x->plane[i];
    struct macroblockd_plane *const pd = &xd->plane[i];
    const uint32_t dc_quant = pd->dequant[0];
    const uint32_t ac_quant = pd->dequant[1];
    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
    unsigned int var;

    if (!x->color_sensitivity[i - 1])
      continue;

    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
                             pd->dst.buf, pd->dst.stride, &sse);
    *var_y += var;
    *sse_y += sse;

  #if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
                                   dc_quant >> (xd->bd - 5), &rate, &dist);
    } else {
      vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
                                   dc_quant >> 3, &rate, &dist);
    }
  #else
    vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
                                 dc_quant >> 3, &rate, &dist);
  #endif  // CONFIG_VP9_HIGHBITDEPTH

    *out_rate_sum += rate >> 1;
    *out_dist_sum += dist << 3;

  #if CONFIG_VP9_HIGHBITDEPTH
    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
      vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
                                   ac_quant >> (xd->bd - 5), &rate, &dist);
    } else {
      vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
                                   ac_quant >> 3, &rate, &dist);
    }
  #else
    vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
                                 ac_quant >> 3, &rate, &dist);
  #endif  // CONFIG_VP9_HIGHBITDEPTH

    *out_rate_sum += rate;
    *out_dist_sum += dist << 4;
  }
}

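// Returns the index of the first unused scratch prediction buffer, marking
// it in use, or -1 if none is free.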
static int get_pred_buffer(PRED_BUFFER *p, int len) {
  int i;

  for (i = 0; i < len; i++) {
    if (!p[i].in_use) {
      p[i].in_use = 1;
      return i;
    }
  }
  return -1;
}

static void free_pred_buffer(PRED_BUFFER *p) {
  if (p != NULL)
    p->in_use = 0;
}

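// Encode-breakout test for the non-RD path: if the Y, U and V prediction
// residuals all fall below quantizer-derived AC/DC thresholds, the block is
// marked as skip and rate/dist are set directly.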
static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
                                 MV_REFERENCE_FRAME ref_frame,
                                 PREDICTION_MODE this_mode,
                                 unsigned int var_y, unsigned int sse_y,
                                 struct buf_2d yv12_mb[][MAX_MB_PLANE],
                                 int *rate, int64_t *dist) {
  MACROBLOCKD *xd = &x->e_mbd;
  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;

  const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
  unsigned int var = var_y, sse = sse_y;
  // Skipping threshold for ac.
  unsigned int thresh_ac;
  // Skipping threshold for dc.
  unsigned int thresh_dc;
  if (x->encode_breakout > 0) {
    // Set a maximum for the threshold to avoid a big PSNR loss in low
    // bit-rate cases. Use an extremely low threshold for static frames to
    // limit skipping.
    const unsigned int max_thresh = 36000;
    // The encode_breakout input determines the minimum threshold.
    const unsigned int min_thresh =
        MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
#if CONFIG_VP9_HIGHBITDEPTH
    const int shift = (xd->bd << 1) - 16;
#endif

    // Calculate threshold according to dequant value.
    thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) >> 3;
#if CONFIG_VP9_HIGHBITDEPTH
    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
      thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
    }
#endif  // CONFIG_VP9_HIGHBITDEPTH
    thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);

    // Adjust ac threshold according to partition size.
    thresh_ac >>=
        8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);

    thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
#if CONFIG_VP9_HIGHBITDEPTH
    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
      thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);
    }
#endif  // CONFIG_VP9_HIGHBITDEPTH
  } else {
    thresh_ac = 0;
    thresh_dc = 0;
  }

  // Y skipping condition checking for ac and dc.
  if (var <= thresh_ac && (sse - var) <= thresh_dc) {
    unsigned int sse_u, sse_v;
    unsigned int var_u, var_v;

    // Skip UV prediction unless breakout is zero (lossless) to save
    // computation with low impact on the result.
    if (x->encode_breakout == 0) {
      xd->plane[1].pre[0] = yv12_mb[ref_frame][1];
      xd->plane[2].pre[0] = yv12_mb[ref_frame][2];
      vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
    }

    var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
                                    x->plane[1].src.stride,
                                    xd->plane[1].dst.buf,
                                    xd->plane[1].dst.stride, &sse_u);

    // U skipping condition checking
    if (((var_u << 2) <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
      var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
                                      x->plane[2].src.stride,
                                      xd->plane[2].dst.buf,
                                      xd->plane[2].dst.stride, &sse_v);

      // V skipping condition checking
      if (((var_v << 2) <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
        x->skip = 1;

        // The cost of skip bit needs to be added.
        *rate = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
                                    [INTER_OFFSET(this_mode)];

        // More on this part of rate
        // rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);

        // Scaling factor for SSE from spatial domain to frequency
        // domain is 16. Adjust distortion accordingly.
        // TODO(yunqingwang): In this function, only y-plane dist is
        // calculated.
        *dist = (sse << 4);  // + ((sse_u + sse_v) << 4);

        // *disable_skip = 1;
      }
    }
  }
}

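// Per-transform-block intra cost callback used with
// vp9_foreach_transformed_block_in_plane(); accumulates approximate rate and
// distortion for the prediction mode held in estimate_block_intra_args.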
struct estimate_block_intra_args {
  VP9_COMP *cpi;
  MACROBLOCK *x;
  PREDICTION_MODE mode;
  int rate;
  int64_t dist;
};

static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                                 TX_SIZE tx_size, void *arg) {
  struct estimate_block_intra_args* const args = arg;
  VP9_COMP *const cpi = args->cpi;
  MACROBLOCK *const x = args->x;
  MACROBLOCKD *const xd = &x->e_mbd;
  struct macroblock_plane *const p = &x->plane[0];
  struct macroblockd_plane *const pd = &xd->plane[0];
  const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
  uint8_t *const src_buf_base = p->src.buf;
  uint8_t *const dst_buf_base = pd->dst.buf;
  const int src_stride = p->src.stride;
  const int dst_stride = pd->dst.stride;
  int i, j;
  int rate;
  int64_t dist;
  int64_t this_sse = INT64_MAX;
  int is_skippable;

  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
  assert(plane == 0);
  (void) plane;

  p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
  pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
  // Use source buffer as an approximation for the fully reconstructed buffer.
  vp9_predict_intra_block(xd, block >> (2 * tx_size),
                          b_width_log2_lookup[plane_bsize],
                          tx_size, args->mode,
                          x->skip_encode ? p->src.buf : pd->dst.buf,
                          x->skip_encode ? src_stride : dst_stride,
                          pd->dst.buf, dst_stride,
                          i, j, 0);

  // TODO(jingning): This needs further refactoring.
  block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
            bsize_tx, MIN(tx_size, TX_16X16));
  // this_sse is a dummy variable here. Its value should remain INT64_MAX.
  assert(this_sse == INT64_MAX);
  x->skip_txfm[0] = is_skippable;
  rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);

  p->src.buf = src_buf_base;
  pd->dst.buf = dst_buf_base;
  args->rate += rate;
  args->dist += dist;
}

static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
  {THR_DC, THR_V_PRED, THR_H_PRED, THR_TM},
  {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
  {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
};

static const PREDICTION_MODE intra_mode_list[] = {
  DC_PRED, V_PRED, H_PRED, TM_PRED
};

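// Maps a prediction mode to its column in mode_idx: intra modes map to 0-3,
// inter modes to INTER_OFFSET(), and anything else to -1.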
static int mode_offset(const PREDICTION_MODE mode) {
  if (mode >= NEARESTMV) {
    return INTER_OFFSET(mode);
  } else {
    switch (mode) {
      case DC_PRED:
        return 0;
      case V_PRED:
        return 1;
      case H_PRED:
        return 2;
      case TM_PRED:
        return 3;
      default:
        return -1;
    }
  }
}

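// Adaptive pruning of the mode search: decay the threshold factor for the
// mode that just won, and inflate it (up to a cap) for modes that lost.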
static INLINE void update_thresh_freq_fact(VP9_COMP *cpi,
                                           TileDataEnc *tile_data,
                                           BLOCK_SIZE bsize,
                                           MV_REFERENCE_FRAME ref_frame,
                                           THR_MODES best_mode_idx,
                                           PREDICTION_MODE mode) {
  THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
  int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx];
  if (thr_mode_idx == best_mode_idx)
    *freq_fact -= (*freq_fact >> 4);
  else
    *freq_fact = MIN(*freq_fact + RD_THRESH_INC,
        cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
}

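// Non-RD intra mode selection: scores the modes DC_PRED through H_PRED with
// the approximate block_yrd cost and keeps the best one.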
void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  RD_COST this_rdc, best_rdc;
  PREDICTION_MODE this_mode;
  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
  const TX_SIZE intra_tx_size =
      MIN(max_txsize_lookup[bsize],
          tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
  MODE_INFO *const mic = xd->mi[0];
  int *bmode_costs;
  const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
  const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
  const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
  const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
  bmode_costs = cpi->y_mode_costs[A][L];

  (void) ctx;
  vp9_rd_cost_reset(&best_rdc);
  vp9_rd_cost_reset(&this_rdc);

  mbmi->ref_frame[0] = INTRA_FRAME;
  mbmi->mv[0].as_int = INVALID_MV;
  mbmi->uv_mode = DC_PRED;
  memset(x->skip_txfm, 0, sizeof(x->skip_txfm));

  // Change the limit of this loop to add other intra prediction
  // mode tests.
  for (this_mode = DC_PRED; this_mode <= H_PRED; ++this_mode) {
    args.mode = this_mode;
    args.rate = 0;
    args.dist = 0;
    mbmi->tx_size = intra_tx_size;
    vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                           estimate_block_intra, &args);
    this_rdc.rate = args.rate;
    this_rdc.dist = args.dist;
    this_rdc.rate += bmode_costs[this_mode];
    this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                             this_rdc.rate, this_rdc.dist);

    if (this_rdc.rdcost < best_rdc.rdcost) {
      best_rdc = this_rdc;
      mbmi->mode = this_mode;
    }
  }

  *rd_cost = best_rdc;
}

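// Precomputes the cost in bits of signalling each reference frame (or
// intra) given the current prediction context.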
static void init_ref_frame_cost(VP9_COMMON *const cm,
                                MACROBLOCKD *const xd,
                                int ref_frame_cost[MAX_REF_FRAMES]) {
  vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
  vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
  vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);

  ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
  ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =
    ref_frame_cost[ALTREF_FRAME] = vp9_cost_bit(intra_inter_p, 1);

  ref_frame_cost[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
  ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
  ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
  ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
  ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
}

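// Candidate (reference frame, prediction mode) pairs tried by the non-RD
// inter mode search, ordered so the cheaper modes are evaluated first.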
typedef struct {
  MV_REFERENCE_FRAME ref_frame;
  PREDICTION_MODE pred_mode;
} REF_MODE;

#define RT_INTER_MODES 8
static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
    {LAST_FRAME, ZEROMV},
    {LAST_FRAME, NEARESTMV},
    {GOLDEN_FRAME, ZEROMV},
    {LAST_FRAME, NEARMV},
    {LAST_FRAME, NEWMV},
    {GOLDEN_FRAME, NEARESTMV},
    {GOLDEN_FRAME, NEARMV},
    {GOLDEN_FRAME, NEWMV}
};

// TODO(jingning): Placeholder for inter-frame non-RD mode decision; this
// needs various further optimizations. To be continued.
void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                         TileDataEnc *tile_data,
                         int mi_row, int mi_col, RD_COST *rd_cost,
                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
  VP9_COMMON *const cm = &cpi->common;
  SPEED_FEATURES *const sf = &cpi->sf;
  TileInfo *const tile_info = &tile_data->tile_info;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  struct macroblockd_plane *const pd = &xd->plane[0];
  PREDICTION_MODE best_mode = ZEROMV;
  MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
  MV_REFERENCE_FRAME usable_ref_frame;
  TX_SIZE best_tx_size = TX_SIZES;
  INTERP_FILTER best_pred_filter = EIGHTTAP;
  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                    VP9_ALT_FLAG };
  RD_COST this_rdc, best_rdc;
  uint8_t skip_txfm = 0, best_mode_skip_txfm = 0;
  // var_y and sse_y are saved to be used in the skipping check.
  unsigned int var_y = UINT_MAX;
  unsigned int sse_y = UINT_MAX;
  // Reduce the intra cost penalty for small blocks (<=16x16).
  const int reduction_fac = (bsize <= BLOCK_16X16) ?
      ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
  const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
                                           intra_cost_penalty, 0);
  const int *const rd_threshes = cpi->rd.threshes[mbmi->segment_id][bsize];
  const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
  INTERP_FILTER filter_ref;
  const int bsl = mi_width_log2_lookup[bsize];
  const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
      (((mi_row + mi_col) >> bsl) +
       get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
  int const_motion[MAX_REF_FRAMES] = { 0 };
  const int bh = num_4x4_blocks_high_lookup[bsize] << 2;
  const int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
  // For speed 6, the result of the interp filter search is reused later in
  // the actual encoding process.
  // tmp[3] points to the dst buffer; the other three point to the allocated
  // prediction buffers.
  PRED_BUFFER tmp[4];
  DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 64 * 64]);
#if CONFIG_VP9_HIGHBITDEPTH
  DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64]);
#endif
  struct buf_2d orig_dst = pd->dst;
  PRED_BUFFER *best_pred = NULL;
  PRED_BUFFER *this_mode_pred = NULL;
  const int pixels_in_block = bh * bw;
  int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready;
  int ref_frame_skip_mask = 0;
  int idx;
  int best_pred_sad = INT_MAX;
  int best_early_term = 0;
  int ref_frame_cost[MAX_REF_FRAMES];

  init_ref_frame_cost(cm, xd, ref_frame_cost);

  if (reuse_inter_pred) {
    int i;