From debb9c68c8ea92b80627138f95de901cb39cf8dc Mon Sep 17 00:00:00 2001
From: Jingning Han <jingning@google.com>
Date: Wed, 7 Aug 2013 15:22:51 -0700
Subject: [PATCH] Use low precision 32x32fdct for encodemb in speed1

The low precision 32x32 fdct keeps all of its intermediate steps within
16-bit depth, which allows a faster SSE2 implementation at the expense
of a larger round-trip error. Until now it was used only in the
rate-distortion optimization search loop.

Using the low precision version in place of the high precision one
affects the compression performance by about 0.7% (derf, stdhd) at
speed 0. At speed 1, it brings the derf set down by only 0.017%.

Change-Id: I4e7d18fac5bea5317b91c8e7dabae143bc6b5c8b
---
 vp9/encoder/vp9_block.h       | 2 +-
 vp9/encoder/vp9_encodeframe.c | 4 ++--
 vp9/encoder/vp9_encodemb.c    | 4 ++--
 vp9/encoder/vp9_onyx_if.c     | 5 +++++
 vp9/encoder/vp9_onyx_int.h    | 1 +
 5 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 3e377cf6f6..790b3c22ca 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -144,7 +144,7 @@ struct macroblock {
   int optimize;
 
   // indicate if it is in the rd search loop or encoding process
-  int rd_search;
+  int use_lp32x32fdct;
   int skip_encode;
 
   // Used to store sub partition's choices.
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 82859c5d7f..39ca5efc4e 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -565,7 +565,7 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  x->rd_search = 1;
+  x->use_lp32x32fdct = 1;
 
   if (bsize < BLOCK_8X8) {
     // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
@@ -2546,7 +2546,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
   const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  x->rd_search = 0;
+  x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
   x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
                     xd->q_index < QIDX_SKIP_THRESH);
   if (x->skip_encode)
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 40b0a4e5a2..f43a281379 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -475,7 +475,7 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
       xoff = 32 * (block & twmask);
       yoff = 32 * (block >> twl);
       src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      if (x->rd_search)
+      if (x->use_lp32x32fdct)
         vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
       else
         vp9_short_fdct32x32(src_diff, coeff, bw * 8);
@@ -670,7 +670,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
                               dst, pd->dst.stride, dst, pd->dst.stride);
       vp9_subtract_block(32, 32, src_diff, bw * 4,
                          src, p->src.stride, dst, pd->dst.stride);
-      if (x->rd_search)
+      if (x->use_lp32x32fdct)
         vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
       else
         vp9_short_fdct32x32(src_diff, coeff, bw * 8);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index cf5ae5252d..9ad63efcb9 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -723,6 +723,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->adaptive_rd_thresh = 0;
   sf->use_lastframe_partitioning = 0;
   sf->tx_size_search_method = USE_FULL_RD;
+  sf->use_lp32x32fdct = 0;
   sf->use_8tap_always = 0;
   sf->use_avoid_tested_higherror = 0;
   sf->reference_masking = 0;
@@ -794,6 +795,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         sf->use_uv_intra_rd_estimate = 1;
         sf->use_rd_breakout = 1;
         sf->skip_encode_sb = 1;
+        sf->use_lp32x32fdct = 1;
         sf->auto_mv_step_size = 1;
 
         sf->auto_min_max_partition_size = 1;
@@ -825,6 +827,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
         sf->use_uv_intra_rd_estimate = 1;
         sf->use_rd_breakout = 1;
         sf->skip_encode_sb = 1;
+        sf->use_lp32x32fdct = 1;
         sf->using_small_partition_info = 1;
         sf->disable_splitmv =
             (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
@@ -848,6 +851,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
                                      FLAG_EARLY_TERMINATE;
         sf->use_rd_breakout = 1;
         sf->skip_encode_sb = 1;
+        sf->use_lp32x32fdct = 1;
         sf->disable_splitmv = 1;
         sf->auto_mv_step_size = 1;
         sf->search_method = BIGDIA;
@@ -869,6 +873,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
                                      FLAG_SKIP_INTRA_LOWVAR |
                                      FLAG_EARLY_TERMINATE;
         sf->use_rd_breakout = 1;
+        sf->use_lp32x32fdct = 1;
         sf->optimize_coefficients = 0;
         sf->auto_mv_step_size = 1;
         // sf->reduce_first_step_size = 1;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 1249107698..fdc1081355 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -257,6 +257,7 @@ typedef struct {
   int skip_encode_frame;
   int use_lastframe_partitioning;
   TX_SIZE_SEARCH_METHOD tx_size_search_method;
+  int use_lp32x32fdct;
   int use_8tap_always;
   int use_avoid_tested_higherror;
   int skip_lots_of_modes;
-- 
GitLab