From 83fcc030563ed13373d8fb559de0993905682458 Mon Sep 17 00:00:00 2001
From: fbossen <frank@bossentech.com>
Date: Thu, 23 Aug 2018 19:12:20 -0230
Subject: [PATCH] Add basic support for nonzero motion vectors (#485)

* Add motion vector parameter to inter prediction function

The motion vector is read from a data field added to the Block structure

* Add reference frame and motion vector fields/parameters

Reference frame and motion vector are added as fields of the
RDOOutput structure and as parameters of the block
encoding function

* Fix inter prediction function for nonzero MVs

* Add call to encode new motion vector

* Import default cdfs for MV coding from libaom

* Fix handling of MV precision in MV coding function

* Add coding of DRL mode

* Add motion vector stack

* Add new MV counter to correctly determine context

* Use motion vectors in MV stack for prediction

* Sort MV stack according to weights

Also update weights after near search

* Fix log2() function

* Compute correct context for DRL flag

* Store MVs with 1/8 pel precision and fix MV generation

* Add some basic motion estimation

Motion estimation is full search (+/-16 pel range in 2-pel increments) and
is done independently for each block

* Fix chroma motion compensation for small blocks

Chroma motion compensation sometimes happens on a 2x2 basis
---
 src/context.rs   | 239 +++++++++++++++++++++++++++++++----------------
 src/encoder.rs   |  90 ++++++++++++++++--
 src/lib.rs       |   1 +
 src/me.rs        |  72 ++++++++++++++
 src/partition.rs |  37 +++++---
 src/predict.rs   |   1 +
 src/rdo.rs       |  19 +++-
 7 files changed, 358 insertions(+), 101 deletions(-)
 create mode 100644 src/me.rs

diff --git a/src/context.rs b/src/context.rs
index 9c70e162..061c8d25 100755
--- a/src/context.rs
+++ b/src/context.rs
@@ -64,6 +64,9 @@ const TX_SETS: usize = 9;
 const TX_SETS_INTRA: usize = 3;
 const TX_SETS_INTER: usize = 4;
 
+const MAX_REF_MV_STACK_SIZE: usize = 8;
+pub const REF_CAT_LEVEL: u32 = 640;
+
 // Number of transform types in each set type
 static num_tx_set: [usize; TX_SETS] =
   [1, 2, 5, 7, 7, 10, 12, 16, 16];
@@ -377,6 +380,7 @@ const PLANE_TYPES: usize = 2;
 const REF_TYPES: usize = 2;
 const SKIP_CONTEXTS: usize = 3;
 const INTRA_INTER_CONTEXTS: usize = 4;
+const DRL_MODE_CONTEXTS: usize = 3;
 
 // Level Map
 const TXB_SKIP_CONTEXTS: usize =  13;
@@ -758,6 +762,27 @@ pub fn uv_intra_mode_to_tx_type_context(pred: PredictionMode) -> TxType {
   intra_mode_to_tx_type_context[uv2y[pred as usize] as usize]
 }
 
+#[derive(Clone,Copy)]
+#[repr(C)]
+pub struct NMVComponent {
+  classes_cdf: [u16; MV_CLASSES + 1],
+  class0_fp_cdf: [[u16; MV_FP_SIZE + 1]; CLASS0_SIZE],
+  fp_cdf: [u16; MV_FP_SIZE + 1],
+  sign_cdf: [u16; 2 + 1],
+  class0_hp_cdf: [u16; 2 + 1],
+  hp_cdf: [u16; 2 + 1],
+  class0_cdf: [u16; CLASS0_SIZE + 1],
+  bits_cdf: [[u16; 2 + 1]; MV_OFFSET_BITS],
+}
+
+#[derive(Clone,Copy)]
+#[repr(C)]
+pub struct NMVContext {
+  joints_cdf: [u16; MV_JOINTS + 1],
+  comps: [NMVComponent; 2],
+}
+
+
 extern "C" {
   static default_partition_cdf:
     [[u16; EXT_PARTITION_TYPES + 1]; PARTITION_CONTEXTS];
@@ -804,6 +829,9 @@ extern "C" {
   static av1_default_coeff_lps_multi_cdfs: [[[[[u16; BR_CDF_SIZE + 1];
     LEVEL_CONTEXTS]; PLANE_TYPES];
     TxSize::TX_SIZES]; 4];
+
+  static default_nmv_context: NMVContext;
+  static default_drl_cdf:[[u16; 2 + 1]; DRL_MODE_CONTEXTS];
 }
 
 #[repr(C)]
@@ -814,6 +842,13 @@ pub struct SCAN_ORDER {
   pub neighbors: &'static [u16; ((32 * 32) + 1) * 2]
 }
 
+#[derive(Clone)]
+pub struct CandidateMV {
+  pub this_mv: MotionVector,
+  pub comp_mv: MotionVector,
+  pub weight: u32
+}
+
 #[derive(Clone)]
 pub struct CDFContext {
   partition_cdf: [[u16; EXT_PARTITION_TYPES + 1]; PARTITION_CONTEXTS],
@@ -831,6 +866,8 @@ pub struct CDFContext {
   angle_delta_cdf: [[u16; 2 * MAX_ANGLE_DELTA + 1 + 1]; DIRECTIONAL_MODES],
   filter_intra_cdfs: [[u16; 3]; BlockSize::BLOCK_SIZES_ALL],
   single_ref_cdfs: [[[u16; 2 + 1]; SINGLE_REFS - 1]; REF_CONTEXTS],
+  drl_cdfs: [[u16; 2 + 1]; DRL_MODE_CONTEXTS],
+  nmv_context: NMVContext,
 
   // lv_map
   txb_skip_cdf: [[[u16; 3]; TXB_SKIP_CONTEXTS]; TxSize::TX_SIZES],
@@ -877,6 +914,8 @@ impl CDFContext {
       angle_delta_cdf: default_angle_delta_cdf,
       filter_intra_cdfs: default_filter_intra_cdfs,
       single_ref_cdfs: default_single_ref_cdf,
+      drl_cdfs: default_drl_cdf,
+      nmv_context: default_nmv_context,
 
       // lv_map
       txb_skip_cdf: av1_default_txb_skip_cdfs[qctx],
@@ -1030,7 +1069,7 @@ mod test {
 
 const SUPERBLOCK_TO_PLANE_SHIFT: usize = MAX_SB_SIZE_LOG2;
 const SUPERBLOCK_TO_BLOCK_SHIFT: usize = MAX_MIB_SIZE_LOG2;
-const BLOCK_TO_PLANE_SHIFT: usize = MI_SIZE_LOG2;
+pub const BLOCK_TO_PLANE_SHIFT: usize = MI_SIZE_LOG2;
 pub const LOCAL_BLOCK_MASK: usize = (1 << SUPERBLOCK_TO_BLOCK_SHIFT) - 1;
 
 /// Absolute offset in superblocks inside a plane, where a superblock is defined
@@ -1110,6 +1149,7 @@ pub struct Block {
   pub partition: PartitionType,
   pub skip: bool,
   pub ref_frames: [usize; 2],
+  pub mv: [MotionVector; 2],
   pub neighbors_ref_counts: [usize; TOTAL_REFS_PER_FRAME],
   pub cdef_index: u8,
   pub n4_w: usize, /* block width in the unit of mode_info */
@@ -1126,6 +1166,7 @@ impl Block {
       partition: PartitionType::PARTITION_NONE,
       skip: false,
       ref_frames: [INTRA_FRAME; 2],
+      mv: [ MotionVector { row:0, col: 0 }; 2],
       neighbors_ref_counts: [0; TOTAL_REFS_PER_FRAME],
       cdef_index: 0,
       n4_w: BLOCK_64X64.width_mi(),
@@ -1415,6 +1456,17 @@ impl BlockContext {
     }
   }
 
+  pub fn set_motion_vector(&mut self, bo: &BlockOffset, bsize: BlockSize, mv: MotionVector) {
+    let bw = bsize.width_mi();
+    let bh = bsize.height_mi();
+
+    for y in 0..bh {
+      for x in 0..bw {
+        self.blocks[bo.y + y as usize][bo.x + x as usize].mv[0] = mv;
+      }
+    }
+  }
+
   pub fn set_cdef(&mut self, sbo: &SuperBlockOffset, cdef_index: u8) {
     let bo = sbo.block_offset(0, 0);
     // Checkme: Is 16 still the right block unit for 128x128 superblocks?
@@ -1874,31 +1926,48 @@ impl ContextWriter {
     cmp::max(col_offset, -(mi_col as isize))
   }
 
-  fn add_ref_mv_candidate(&self, blk: &Block) -> bool {
+  fn find_matching_mv(&self, blk: &Block, mv_stack: &mut Vec<CandidateMV>, weight: u32) -> bool {
+    for mut mv_cand in mv_stack {
+      if blk.mv[0].row == mv_cand.this_mv.row && blk.mv[0].col == mv_cand.this_mv.col {
+        mv_cand.weight += weight;
+        return true;
+      }
+    }
+    false
+  }
+
+  fn add_ref_mv_candidate(&self, ref_frame: usize, blk: &Block, mv_stack: &mut Vec<CandidateMV>,
+                          weight: u32, newmv_count: &mut usize) -> bool {
     if !blk.is_inter() { /* For intrabc */
       return false;
     }
 
-/*
-    let index = 0;
+    if blk.ref_frames[0] == ref_frame {
+      let found_match = self.find_matching_mv(blk, mv_stack, weight);
 
-    if rf[1] == NONE_FRAME {
-      for i in 0..2 {
-        if cand.ref_frame[i] == rf[0] {
+      if !found_match && mv_stack.len() < MAX_REF_MV_STACK_SIZE {
+        let mv_cand = CandidateMV {
+          this_mv: blk.mv[0],
+          comp_mv: blk.mv[1],
+          weight: weight
+        };
 
-        }
+        mv_stack.push(mv_cand);
       }
-    } else {
-      if cand.ref_frame[0] == rf[0] && cand.ref_frame[1] == rf[1] {
 
+      if blk.mode == PredictionMode::NEWMV {
+        *newmv_count += 1;
       }
+
+      true
+    } else {
+      false
     }
-*/
-    true
   }
 
   fn scan_row_mbmi(&mut self, bo: &BlockOffset, row_offset: isize, max_row_offs: isize,
-                   processed_rows: &mut isize) -> bool {
+                   processed_rows: &mut isize, ref_frame: usize,
+                   mv_stack: &mut Vec<CandidateMV>, newmv_count: &mut usize) -> bool {
     let bc = &self.bc;
     let target_n4_w = bc.at(bo).n4_w;
 
@@ -1907,7 +1976,6 @@ impl ContextWriter {
     let n4_w_8 = BlockSize::MI_SIZE_WIDE[BLOCK_8X8 as usize];
     let n4_w_16 = BlockSize::MI_SIZE_WIDE[BLOCK_16X16 as usize];
     let mut col_offset = 0;
-    //let shift = 0;
 
     if row_offset.abs() > 1 {
       col_offset = 1;
@@ -1932,14 +2000,17 @@ impl ContextWriter {
         len = cmp::max(len, n4_w_8);
       }
 
-      //let mut weight = 2;
+      let mut weight = 2 as u32;
       if target_n4_w >= n4_w_8 && target_n4_w <= n4_w {
         let inc = cmp::min(-max_row_offs + row_offset + 1, cand.n4_h as isize);
-        //weight = cmp::max(weight, inc << shift);
+        assert!(inc >= 0);
+        weight = cmp::max(weight, inc as u32);
         *processed_rows = (inc as isize) - row_offset - 1;
       }
 
-      if self.add_ref_mv_candidate(cand) { found_match = true; }
+      if self.add_ref_mv_candidate(ref_frame, cand, mv_stack, len as u32 * weight, newmv_count) {
+        found_match = true;
+      }
 
       i += len;
     }
@@ -1948,7 +2019,8 @@ impl ContextWriter {
   }
 
   fn scan_col_mbmi(&mut self, bo: &BlockOffset, col_offset: isize, max_col_offs: isize,
-                   processed_cols: &mut isize) -> bool {
+                   processed_cols: &mut isize, ref_frame: usize,
+                   mv_stack: &mut Vec<CandidateMV>, newmv_count: &mut usize) -> bool {
     let bc = &self.bc;
     let target_n4_h = bc.at(bo).n4_h;
 
@@ -1957,7 +2029,6 @@ impl ContextWriter {
     let n4_h_8 = BlockSize::MI_SIZE_HIGH[BLOCK_8X8 as usize];
     let n4_h_16 = BlockSize::MI_SIZE_HIGH[BLOCK_16X16 as usize];
     let mut row_offset = 0;
-    //let shift = 0;
 
     if col_offset.abs() > 1 {
       row_offset = 1;
@@ -1981,14 +2052,17 @@ impl ContextWriter {
         len = cmp::max(len, n4_h_8);
       }
 
-      //let mut weight = 2;
+      let mut weight = 2 as u32;
       if target_n4_h >= n4_h_8 && target_n4_h <= n4_h {
         let inc = cmp::min(-max_col_offs + col_offset + 1, cand.n4_w as isize);
-        //weight = cmp::max(weight, inc << shift);
+        assert!(inc >= 0);
+        weight = cmp::max(weight, inc as u32);
         *processed_cols = (inc as isize) - col_offset - 1;
       }
 
-      if self.add_ref_mv_candidate(cand) { found_match = true; }
+      if self.add_ref_mv_candidate(ref_frame, cand, mv_stack, len as u32 * weight, newmv_count) {
+        found_match = true;
+      }
 
       i += len;
     }
@@ -1996,16 +2070,24 @@ impl ContextWriter {
     found_match
   }
 
-  fn scan_blk_mbmi(&mut self, bo: &BlockOffset) -> bool {
+  fn scan_blk_mbmi(&mut self, bo: &BlockOffset, ref_frame: usize,
+                   mv_stack: &mut Vec<CandidateMV>, newmv_count: &mut usize) -> bool {
     if bo.x >= self.bc.cols || bo.y >= self.bc.rows {
       return false;
     }
 
+    let weight = 2 * BLOCK_8X8.width_mi() as u32;
     /* Always assume its within a tile, probably wrong */
-    self.add_ref_mv_candidate(self.bc.at(bo))
+    self.add_ref_mv_candidate(ref_frame, self.bc.at(bo), mv_stack, weight, newmv_count)
+  }
+
+  fn add_offset(&mut self, mv_stack: &mut Vec<CandidateMV>) {
+    for mut cand_mv in mv_stack {
+      cand_mv.weight += REF_CAT_LEVEL;
+    }
   }
 
-  fn setup_mvref_list(&mut self, bo: &BlockOffset) -> usize {
+  fn setup_mvref_list(&mut self, bo: &BlockOffset, ref_frame: usize, mv_stack: &mut Vec<CandidateMV>) -> usize {
     let (_rf, _rf_num) = self.get_mvref_ref_frames(INTRA_FRAME);
 
     let mut max_row_offs = 0 as isize;
@@ -2045,28 +2127,33 @@ impl ContextWriter {
 
     let mut row_match = false;
     let mut col_match = false;
-    let newmv_count = 0;
+    let mut newmv_count: usize = 0;
 
     if max_row_offs.abs() >= 1 {
-      let found_match = self.scan_row_mbmi(bo, -1, max_row_offs, &mut processed_rows);
+      let found_match = self.scan_row_mbmi(bo, -1, max_row_offs, &mut processed_rows, ref_frame, mv_stack,
+                                           &mut newmv_count);
       row_match |= found_match;
     }
     if max_col_offs.abs() >= 1 {
-      let found_match = self.scan_col_mbmi(bo, -1, max_col_offs, &mut processed_cols);
+      let found_match = self.scan_col_mbmi(bo, -1, max_col_offs, &mut processed_cols, ref_frame, mv_stack,
+                                           &mut newmv_count);
       col_match |= found_match;
     }
     if self.has_tr(bo) {
       let n4_w = self.bc.at(bo).n4_w;
-      let found_match = self.scan_blk_mbmi(&bo.with_offset(n4_w as isize, -1));
+      let found_match = self.scan_blk_mbmi(&bo.with_offset(n4_w as isize, -1), ref_frame, mv_stack,
+                                           &mut newmv_count);
       row_match |= found_match;
     }
 
     let nearest_match = if row_match { 1 } else { 0 } + if col_match { 1 } else { 0 };
 
-    /* TODO: set ref_mv_stack weights to REF_CAT_LEVEL for this ref frame */
+    self.add_offset(mv_stack);
 
     /* Scan the second outer area. */
-    let found_match = self.scan_blk_mbmi(&bo.with_offset(-1, -1));
+    let mut far_newmv_count: usize = 0; // won't be used
+
+    let found_match = self.scan_blk_mbmi(&bo.with_offset(-1, -1), ref_frame, mv_stack, &mut far_newmv_count);
     row_match |= found_match;
 
     for idx in 2..MVREF_ROW_COLS+1 {
@@ -2074,12 +2161,14 @@ impl ContextWriter {
       let col_offset = -2 * idx as isize + 1 + col_adj as isize;
 
       if row_offset.abs() <= max_row_offs.abs() && row_offset.abs() > processed_rows {
-        let found_match = self.scan_row_mbmi(bo, row_offset, max_row_offs, &mut processed_rows);
+        let found_match = self.scan_row_mbmi(bo, row_offset, max_row_offs, &mut processed_rows, ref_frame, mv_stack,
+                                             &mut far_newmv_count);
         row_match |= found_match;
       }
 
       if col_offset.abs() <= max_col_offs.abs() && col_offset.abs() > processed_cols {
-        let found_match = self.scan_col_mbmi(bo, col_offset, max_col_offs, &mut processed_cols);
+        let found_match = self.scan_col_mbmi(bo, col_offset, max_col_offs, &mut processed_cols, ref_frame, mv_stack,
+                                             &mut far_newmv_count);
         col_match |= found_match;
       }
     }
@@ -2096,12 +2185,26 @@ impl ContextWriter {
 
     /* TODO: Find nearest match and assign nearest and near mvs */
 
+    // Sort MV stack according to weight
+    if mv_stack.len() > 1 {
+      let mut i: usize = 1;
+      while i < mv_stack.len() {
+        let mut j = i;
+        while j > 0 && mv_stack[j - 1].weight < mv_stack[j].weight {
+          mv_stack.swap(j, j - 1);
+          j = j - 1;
+        }
+        i = i + 1;
+      }
+    }
+
     /* TODO: Handle single reference frame extension */
 
     mode_context
   }
 
-  pub fn find_mvrefs(&mut self, bo: &BlockOffset, ref_frame: usize) -> usize {
+  pub fn find_mvrefs(&mut self, bo: &BlockOffset, ref_frame: usize,
+                     mv_stack: &mut Vec<CandidateMV>) -> usize {
     if ref_frame < REF_FRAMES {
       if ref_frame != INTRA_FRAME {
         /* TODO: convert global mv to an mv here */
@@ -2116,7 +2219,7 @@ impl ContextWriter {
       /* TODO: Set the zeromv ref to 0 */
     }
 
-    let mode_context = self.setup_mvref_list(bo);
+    let mode_context = self.setup_mvref_list(bo, ref_frame, mv_stack);
     mode_context
   }
 
@@ -2296,6 +2399,26 @@ impl ContextWriter {
     }
   }
 
+  pub fn write_drl_mode(&mut self, w: &mut dyn Writer, drl_mode: bool, ctx: usize) {
+    symbol_with_update!(self, w, drl_mode as u32, &mut self.fc.drl_cdfs[ctx]);
+  }
+
+  pub fn write_mv(&mut self, w: &mut dyn Writer,
+                  mv: &MotionVector, ref_mv: &MotionVector,
+                  mv_precision: MvSubpelPrecision) {
+    let diff = MotionVector { row: mv.row - ref_mv.row, col: mv.col - ref_mv.col };
+    let j: MvJointType = av1_get_mv_joint(&diff);
+
+    w.symbol_with_update(j as u32, &mut self.fc.nmv_context.joints_cdf);
+
+    if mv_joint_vertical(j) {
+      encode_mv_component(w, diff.row as i32, &mut self.fc.nmv_context.comps[0], mv_precision);
+    }
+    if mv_joint_horizontal(j) {
+      encode_mv_component(w, diff.col as i32, &mut self.fc.nmv_context.comps[1], mv_precision);
+    }
+  }
+
   pub fn write_tx_type(
     &mut self, w: &mut dyn Writer, tx_size: TxSize, tx_type: TxType, y_mode: PredictionMode,
     is_inter: bool, use_reduced_tx_set: bool
@@ -2915,29 +3038,8 @@ const MV_UPP: i32 = (1 << MV_IN_USE_BITS);
 const MV_LOW: i32 = (-(1 << MV_IN_USE_BITS));
 
 
-pub struct nmv_component {
-  classes_cdf: [u16; MV_CLASSES + 1],
-  class0_fp_cdf: [[u16; MV_FP_SIZE + 1]; CLASS0_SIZE],
-  fp_cdf: [u16; MV_FP_SIZE + 1],
-  sign_cdf: [u16; 2 + 1],
-  class0_hp_cdf: [u16; 2 + 1],
-  hp_cdf: [u16; 2 + 1],
-  class0_cdf: [u16; CLASS0_SIZE + 1],
-  bits_cdf: [[u16; 2 + 1]; MV_OFFSET_BITS],
-}
-
-pub struct nmv_context {
-  joints_cdf: [u16; MV_JOINTS + 1],
-  comps: [nmv_component; 2],
-}
-
-pub struct MV {
-  row : i16,
-  col : i16,
-}
-
 #[inline(always)]
-pub fn av1_get_mv_joint(mv: &MV) -> MvJointType {
+pub fn av1_get_mv_joint(mv: &MotionVector) -> MvJointType {
   if mv.row == 0 {
     if mv.col == 0 { MvJointType::MV_JOINT_ZERO } else { MvJointType::MV_JOINT_HNZVZ }
   } else {
@@ -2961,7 +3063,7 @@ pub fn mv_class_base(mv_class: usize) -> u32 {
 #[inline(always)]
 // If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0.
 pub fn log_in_base_2(n: u32) -> u8 {
-  32 - n.leading_zeros() as u8
+  31 - cmp::min(31, n.leading_zeros() as u8)
 }
 #[inline(always)]
 pub fn get_mv_class(z: u32, offset: &mut u32) -> usize {
@@ -2974,7 +3076,7 @@ pub fn get_mv_class(z: u32, offset: &mut u32) -> usize {
 }
 
 pub fn encode_mv_component(w: &mut Writer, comp: i32, 
-  mvcomp: &mut nmv_component, precision: MvSubpelPrecision) {
+  mvcomp: &mut NMVComponent, precision: MvSubpelPrecision) {
   assert!(comp != 0);
   let mut offset: u32 = 0;
   let sign: u32 = if comp < 0 { 1 } else { 0 };
@@ -3016,22 +3118,3 @@ pub fn encode_mv_component(w: &mut Writer, comp: i32,
   }
 }
 
-pub fn av1_encode_mv(w: &mut Writer,
-                   mv: &MV, ref_mv: &MV,
-                   mvctx: &mut nmv_context, mut usehp: MvSubpelPrecision) {
-  let diff = MV { row: mv.row - ref_mv.row, col: mv.col - ref_mv.col };
-  let j: MvJointType = av1_get_mv_joint(&diff);
-
-  // TODO: pass fi.force_integer_mv to this function
-  if false /*fi.force_integer_mv*/ {
-    usehp = MvSubpelPrecision::MV_SUBPEL_NONE;
-  }
-  w.symbol_with_update(j as u32, &mut mvctx.joints_cdf);
-
-  if mv_joint_vertical(j) {
-    encode_mv_component(w, diff.row as i32, &mut mvctx.comps[0], usehp);
-  }
-  if mv_joint_horizontal(j) {
-    encode_mv_component(w, diff.col as i32, &mut mvctx.comps[1], usehp);
-  }
-}
diff --git a/src/encoder.rs b/src/encoder.rs
index f142bd71..f6c248a7 100644
--- a/src/encoder.rs
+++ b/src/encoder.rs
@@ -340,7 +340,7 @@ impl FrameInvariants {
             showable_frame: true,
             error_resilient: true,
             intra_only: false,
-            allow_high_precision_mv: true,
+            allow_high_precision_mv: false,
             frame_type: FrameType::KEY,
             show_existing_frame: false,
             use_reduced_tx_set,
@@ -1198,6 +1198,7 @@ pub fn encode_block_a(seq: &Sequence,
 pub fn encode_block_b(fi: &FrameInvariants, fs: &mut FrameState,
                  cw: &mut ContextWriter, w: &mut dyn Writer,
                  luma_mode: PredictionMode, chroma_mode: PredictionMode,
+                 ref_frame: usize, mv: MotionVector,
                  bsize: BlockSize, bo: &BlockOffset, skip: bool, bit_depth: usize) {
     let is_inter = !luma_mode.is_intra();
     if is_inter { assert!(luma_mode == chroma_mode); };
@@ -1208,15 +1209,46 @@ pub fn encode_block_b(fi: &FrameInvariants, fs: &mut FrameState,
     if fi.frame_type == FrameType::INTER {
         cw.write_is_inter(w, bo, is_inter);
         if is_inter {
-            let ref_frame = LAST_FRAME;
             cw.fill_neighbours_ref_counts(bo);
             cw.bc.set_ref_frame(bo, bsize, ref_frame);
+            cw.bc.set_motion_vector(bo, bsize, mv);
             cw.write_ref_frames(w, bo);
-            let mode_context = cw.find_mvrefs(bo, ref_frame);
+
+            let mut mv_stack = Vec::new();
+            let mode_context = cw.find_mvrefs(bo, ref_frame, &mut mv_stack);
             //let mode_context = if bo.x == 0 && bo.y == 0 { 0 } else if bo.x ==0 || bo.y == 0 { 51 } else { 85 };
             // NOTE: Until rav1e supports other inter modes than GLOBALMV
-            assert!(luma_mode == PredictionMode::GLOBALMV);
             cw.write_inter_mode(w, luma_mode, mode_context);
+
+            if luma_mode == PredictionMode::NEWMV || luma_mode == PredictionMode::NEW_NEWMV {
+              let ref_mv_idx = 0;
+              let num_mv_found = mv_stack.len();
+              for idx in 0..2 {
+                if num_mv_found > idx + 1 {
+                  let drl_mode = ref_mv_idx > idx;
+                  let ctx: usize = (mv_stack[idx].weight < REF_CAT_LEVEL) as usize
+                    + (mv_stack[idx + 1].weight < REF_CAT_LEVEL) as usize;
+
+                  cw.write_drl_mode(w, drl_mode, ctx);
+                  if !drl_mode { break; }
+                }
+              }
+
+              let ref_mv = if num_mv_found > 0 {
+                mv_stack[ref_mv_idx].this_mv
+              } else {
+                MotionVector{ row: 0, col: 0 }
+              };
+
+              let mv_precision = if fi.force_integer_mv != 0 {
+                MvSubpelPrecision::MV_SUBPEL_NONE
+              } else if fi.allow_high_precision_mv {
+                MvSubpelPrecision::MV_SUBPEL_HIGH_PRECISION
+              } else {
+                MvSubpelPrecision::MV_SUBPEL_LOW_PRECISION
+              };
+              cw.write_mv(w, &mv, &ref_mv, mv_precision);
+            }
         } else {
             cw.write_intra_mode(w, bsize, luma_mode);
         }
@@ -1267,6 +1299,9 @@ pub fn encode_block_b(fi: &FrameInvariants, fs: &mut FrameState,
     };
 
     if is_inter {
+      {
+        let ref_frame = cw.bc.at(bo).ref_frames[0];
+        let mv = &cw.bc.at(bo).mv[0];
         // Inter mode prediction can take place once for a whole partition,
         // instead of each tx-block.
         let num_planes = 1 + if has_chroma(bo, bsize, xdec, ydec) { 2 } else { 0 };
@@ -1278,11 +1313,38 @@ pub fn encode_block_b(fi: &FrameInvariants, fs: &mut FrameState,
 
             let rec = &mut fs.rec.planes[p];
 
-            luma_mode.predict_inter(fi, p, &po, &mut rec.mut_slice(&po), plane_bsize);
+            // TODO: make more generic to handle 2xN and Nx2 MC
+            if p > 0 && bsize == BlockSize::BLOCK_4X4 {
+              let mv0 = &cw.bc.at(&bo.with_offset(-1,-1)).mv[0];
+              let mv1 = &cw.bc.at(&bo.with_offset(0,-1)).mv[0];
+              let po1 = PlaneOffset { x: po.x+2, y: po.y };
+              let mv2 = &cw.bc.at(&bo.with_offset(-1,0)).mv[0];
+              let po2 = PlaneOffset { x: po.x, y: po.y+2 };
+              let po3 = PlaneOffset { x: po.x+2, y: po.y+2 };
+              let some_use_intra = cw.bc.at(&bo.with_offset(-1,-1)).mode.is_intra()
+                || cw.bc.at(&bo.with_offset(0,-1)).mode.is_intra()
+                || cw.bc.at(&bo.with_offset(-1,0)).mode.is_intra();
+
+              if some_use_intra {
+                luma_mode.predict_inter(fi, p, &po, &mut rec.mut_slice(&po), plane_bsize.width(),
+                                        plane_bsize.height(), ref_frame, mv);
+              } else {
+                luma_mode.predict_inter(fi, p, &po, &mut rec.mut_slice(&po), 2, 2, ref_frame, mv0);
+                luma_mode.predict_inter(fi, p, &po1, &mut rec.mut_slice(&po1), 2, 2, ref_frame, mv1);
+                luma_mode.predict_inter(fi, p, &po2, &mut rec.mut_slice(&po2), 2, 2, ref_frame, mv2);
+                luma_mode.predict_inter(fi, p, &po3, &mut rec.mut_slice(&po3), 2, 2, ref_frame, mv);
+              }
+            }
+            else
+            {
+              luma_mode.predict_inter(fi, p, &po, &mut rec.mut_slice(&po), plane_bsize.width(),
+                                      plane_bsize.height(), ref_frame, mv);
+            }
         }
-        write_tx_tree(fi, fs, cw, w, luma_mode, bo, bsize, tx_size, tx_type, skip, bit_depth); // i.e. var-tx if inter mode
+      }
+      write_tx_tree(fi, fs, cw, w, luma_mode, bo, bsize, tx_size, tx_type, skip, bit_depth); // i.e. var-tx if inter mode
     } else {
-        write_tx_blocks(fi, fs, cw, w, luma_mode, chroma_mode, bo, bsize, tx_size, tx_type, skip, bit_depth);
+      write_tx_blocks(fi, fs, cw, w, luma_mode, chroma_mode, bo, bsize, tx_size, tx_type, skip, bit_depth);
     }
 }
 
@@ -1437,6 +1499,8 @@ fn encode_partition_bottomup(seq: &Sequence, fi: &FrameInvariants, fs: &mut Fram
         bo: bo.clone(),
         pred_mode_luma: PredictionMode::DC_PRED,
         pred_mode_chroma: PredictionMode::DC_PRED,
+        ref_frame: INTRA_FRAME,
+        mv: MotionVector { row: 0, col: 0},
         skip: false
     }; // Best decision that is not PARTITION_SPLIT
 
@@ -1457,6 +1521,8 @@ fn encode_partition_bottomup(seq: &Sequence, fi: &FrameInvariants, fs: &mut Fram
         }
         let mode_decision = rdo_mode_decision(seq, fi, fs, cw, bsize, bo).part_modes[0].clone();
         let (mode_luma, mode_chroma) = (mode_decision.pred_mode_luma, mode_decision.pred_mode_chroma);
+        let ref_frame = mode_decision.ref_frame;
+        let mv = mode_decision.mv;
         let skip = mode_decision.skip;
         let mut cdef_coded = cw.bc.cdef_coded;
         rd_cost = mode_decision.rd_cost;
@@ -1464,7 +1530,7 @@ fn encode_partition_bottomup(seq: &Sequence, fi: &FrameInvariants, fs: &mut Fram
         cdef_coded = encode_block_a(seq, cw, if cdef_coded  {w_post_cdef} else {w_pre_cdef},
                                    bsize, bo, skip);
         encode_block_b(fi, fs, cw, if cdef_coded  {w_post_cdef} else {w_pre_cdef},
-                       mode_luma, mode_chroma, bsize, bo, skip, seq.bit_depth);
+                       mode_luma, mode_chroma, ref_frame, mv, bsize, bo, skip, seq.bit_depth);
 
         best_decision = mode_decision;
     }
@@ -1509,12 +1575,14 @@ fn encode_partition_bottomup(seq: &Sequence, fi: &FrameInvariants, fs: &mut Fram
 
             // FIXME: redundant block re-encode
             let (mode_luma, mode_chroma) = (best_decision.pred_mode_luma, best_decision.pred_mode_chroma);
+            let ref_frame = best_decision.ref_frame;
+            let mv = best_decision.mv;
             let skip = best_decision.skip;
             let mut cdef_coded = cw.bc.cdef_coded;
             cdef_coded = encode_block_a(seq, cw, if cdef_coded {w_post_cdef} else {w_pre_cdef},
                                        bsize, bo, skip);
             encode_block_b(fi, fs, cw, if cdef_coded {w_post_cdef} else {w_pre_cdef},
-                          mode_luma, mode_chroma, bsize, bo, skip, seq.bit_depth);
+                          mode_luma, mode_chroma, ref_frame, mv, bsize, bo, skip, seq.bit_depth);
         }
     }
 
@@ -1586,13 +1654,15 @@ fn encode_partition_topdown(seq: &Sequence, fi: &FrameInvariants, fs: &mut Frame
 
             let (mode_luma, mode_chroma) = (part_decision.pred_mode_luma, part_decision.pred_mode_chroma);
             let skip = part_decision.skip;
+            let ref_frame = part_decision.ref_frame;
+            let mv = part_decision.mv;
             let mut cdef_coded = cw.bc.cdef_coded;
 
             // FIXME: every final block that has gone through the RDO decision process is encoded twice
             cdef_coded = encode_block_a(seq, cw, if cdef_coded  {w_post_cdef} else {w_pre_cdef},
                          bsize, bo, skip);
             encode_block_b(fi, fs, cw, if cdef_coded  {w_post_cdef} else {w_pre_cdef},
-                          mode_luma, mode_chroma, bsize, bo, skip, seq.bit_depth);
+                          mode_luma, mode_chroma, ref_frame, mv, bsize, bo, skip, seq.bit_depth);
         },
         PartitionType::PARTITION_SPLIT => {
             if rdo_output.part_modes.len() >= 4 {
diff --git a/src/lib.rs b/src/lib.rs
index 62b33d02..6e16d369 100755
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,6 +29,7 @@ pub mod rdo;
 pub mod util;
 pub mod cdef;
 pub mod encoder;
+pub mod me;
 
 pub use encoder::*;
 
diff --git a/src/me.rs b/src/me.rs
new file mode 100644
index 00000000..68173aca
--- /dev/null
+++ b/src/me.rs
@@ -0,0 +1,72 @@
+// Copyright (c) 2017-2018, The rav1e contributors. All rights reserved
+//
+// This source code is subject to the terms of the BSD 2 Clause License and
+// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+// was not distributed with this source code in the LICENSE file, you can
+// obtain it at www.aomedia.org/license/software. If the Alliance for Open
+// Media Patent License 1.0 was not distributed with this source code in the
+// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+
+use std::cmp;
+use FrameInvariants;
+use FrameState;
+use partition::BlockSize;
+use context::BlockOffset;
+use partition::MotionVector;
+use partition::LAST_FRAME;
+use plane::PlaneOffset;
+use context::BLOCK_TO_PLANE_SHIFT;
+
+pub fn motion_estimation(fi: &FrameInvariants, fs: &mut FrameState, bsize: BlockSize,
+                         bo: &BlockOffset, ref_frame: usize) -> MotionVector {
+
+  match fi.rec_buffer.frames[fi.ref_frames[ref_frame - LAST_FRAME]] {
+    Some(ref rec) => {
+      let po = PlaneOffset { x: bo.x << BLOCK_TO_PLANE_SHIFT, y: bo.y << BLOCK_TO_PLANE_SHIFT };
+      let range = 16 as usize;
+      let blk_w = bsize.width();
+      let blk_h = bsize.height();
+      let x_lo = cmp::max(0, po.x as isize - range as isize) as usize;
+      let x_hi = cmp::min(fs.input.planes[0].cfg.width - blk_w, po.x + range);
+      let y_lo = cmp::max(0, po.y as isize - range as isize) as usize;
+      let y_hi = cmp::min(fs.input.planes[0].cfg.height - blk_h, po.y + range);
+
+      let sorg = fs.input.planes[0].slice(&po);
+      let slice_org = sorg.as_slice();
+      let stride_org = fs.input.planes[0].cfg.stride;
+      let stride_ref = rec.planes[0].cfg.stride;
+
+      let mut lowest_sad = 128*128*4096 as usize;
+      let mut best_mv = MotionVector { row: 0, col: 0 };
+
+      for y in (y_lo..y_hi).step_by(2) {
+        for x in (x_lo..x_hi).step_by(2) {
+
+          let mut sad = 0;
+          let sref = rec.planes[0].slice(&PlaneOffset { x: x, y: y });
+          let slice_ref = sref.as_slice();
+
+          for r in 0..blk_h {
+            for c in 0..blk_w {
+              let org_index = r * stride_org + c;
+              let ref_index = r * stride_ref + c;
+              let a = slice_org[org_index];
+              let b = slice_ref[ref_index];
+              let delta = b as isize - a as isize;
+              sad += delta.abs() as usize;
+            }
+          }
+
+          if sad < lowest_sad {
+            lowest_sad = sad;
+            best_mv = MotionVector { row: 8*(y as i16 - po.y as i16), col: 8*(x as i16 - po.x as i16) }
+          }
+
+        }
+      }
+      best_mv
+    }
+
+    None => MotionVector { row: 0, col : 0 }
+  }
+}
diff --git a/src/partition.rs b/src/partition.rs
index c2059ca8..27d174ce 100755
--- a/src/partition.rs
+++ b/src/partition.rs
@@ -10,6 +10,7 @@
 #![allow(non_camel_case_types)]
 #![allow(dead_code)]
 
+use std::cmp;
 use self::BlockSize::*;
 use self::TxSize::*;
 use encoder::FrameInvariants;
@@ -348,6 +349,12 @@ pub enum PredictionMode {
   NEW_NEWMV
 }
 
+#[derive(Copy, Clone)]
+pub struct MotionVector {
+  pub row: i16,
+  pub col: i16,
+}
+
 pub const NEWMV_MODE_CONTEXTS: usize = 7;
 pub const GLOBALMV_MODE_CONTEXTS: usize = 2;
 pub const REFMV_MODE_CONTEXTS: usize = 9;
@@ -529,24 +536,30 @@ impl PredictionMode {
   }
 
   pub fn predict_inter<'a>(self, fi: &FrameInvariants, p: usize, po: &PlaneOffset,
-                           dst: &'a mut PlaneMutSlice<'a>, plane_size: BlockSize) {
+                           dst: &'a mut PlaneMutSlice<'a>, width: usize, height: usize,
+                           ref_frame: usize, mv: &MotionVector) {
     assert!(!self.is_intra());
-    assert!(self == PredictionMode::GLOBALMV); // Other modes not implemented
-
-    let ref_frame_idx = LAST_FRAME;
+    assert!(ref_frame == LAST_FRAME);
 
-    match fi.rec_buffer.frames[fi.ref_frames[ref_frame_idx - LAST_FRAME]] {
+    match fi.rec_buffer.frames[fi.ref_frames[ref_frame - LAST_FRAME]] {
       Some(ref rec) => {
-        let ref_stride = rec.planes[p].cfg.stride;
-        let src = rec.planes[p].slice(po);
-        let ref_slice = src.as_slice();
+        let rec_cfg = &rec.planes[p].cfg;
+        let shift_row = 3 + rec_cfg.ydec;
+        let shift_col = 3 + rec_cfg.xdec;
+        let row_offset = mv.row as i32 >> shift_row;
+        let col_offset = mv.col as i32 >> shift_col;
+        let ref_width = rec_cfg.width;
+        let ref_height = rec_cfg.height;
+
         let stride = dst.plane.cfg.stride;
         let slice = dst.as_mut_slice();
-        for r in 0..plane_size.height() {
-          for c in 0..plane_size.width() {
-            let input_index = r * ref_stride + c;
+
+        for r in 0..height {
+          for c in 0..width {
+            let rs = cmp::min(ref_height as i32 - 1, cmp::max(0, po.y as i32 + row_offset + r as i32)) as usize;
+            let cs = cmp::min(ref_width as i32 - 1, cmp::max(0, po.x as i32 + col_offset + c as i32)) as usize;
             let output_index = r * stride + c;
-            slice[output_index] = ref_slice[input_index];
+            slice[output_index] = rec.planes[p].p(cs, rs);
           }
         }
       },
diff --git a/src/predict.rs b/src/predict.rs
index 723b9c78..7ae0211e 100755
--- a/src/predict.rs
+++ b/src/predict.rs
@@ -38,6 +38,7 @@ pub static RAV1E_INTRA_MODES_MINIMAL: &'static [PredictionMode] = &[
 
 pub static RAV1E_INTER_MODES: &'static [PredictionMode] = &[
   PredictionMode::GLOBALMV,
+  PredictionMode::NEWMV, // rdo_mode_decision pairs this mode with the vector returned by motion_estimation.
 ];
 
 // Weights are quadratic from '1' to '1 / block_size', scaled by 2^sm_weight_log2_scale.
diff --git a/src/rdo.rs b/src/rdo.rs
index a7d82ddf..e0fbb005 100755
--- a/src/rdo.rs
+++ b/src/rdo.rs
@@ -12,6 +12,7 @@
 #![cfg_attr(feature = "cargo-clippy", allow(cast_lossless))]
 
 use context::*;
+use me::*;
 use ec::OD_BITRES;
 use ec::Writer;
 use ec::WriterCounter;
@@ -48,6 +49,8 @@ pub struct RDOPartitionOutput {
   pub bo: BlockOffset,
   pub pred_mode_luma: PredictionMode,
   pub pred_mode_chroma: PredictionMode,
+  pub ref_frame: usize,
+  pub mv: MotionVector,
   pub skip: bool
 }
 
@@ -189,6 +192,8 @@ pub fn rdo_mode_decision(
   let mut best_mode_chroma = PredictionMode::DC_PRED;
   let mut best_skip = false;
   let mut best_rd = std::f64::MAX;
+  let mut best_ref_frame = INTRA_FRAME;
+  let mut best_mv = MotionVector { row: 0, col: 0 };
 
   // Get block luma and chroma dimensions
   let w = bsize.width();
@@ -223,6 +228,13 @@ pub fn rdo_mode_decision(
       mode_set_chroma.push(PredictionMode::DC_PRED);
     }
 
+    let ref_frame = if luma_mode.is_intra() { INTRA_FRAME } else { LAST_FRAME };
+    let mv = if luma_mode != PredictionMode::NEWMV {
+      MotionVector { row: 0, col: 0 }
+    } else {
+      motion_estimation(fi, fs, bsize, bo, ref_frame)
+    };
+
     // Find the best chroma prediction mode for the current luma prediction mode
     for &chroma_mode in &mode_set_chroma {
       for &skip in &[false, true] {
@@ -232,8 +244,9 @@ pub fn rdo_mode_decision(
         let mut wr: &mut dyn Writer = &mut WriterCounter::new();
         let tell = wr.tell_frac();
 
+
         encode_block_a(seq, cw, wr, bsize, bo, skip);
-        encode_block_b(fi, fs, cw, wr, luma_mode, chroma_mode, bsize, bo, skip, seq.bit_depth);
+        encode_block_b(fi, fs, cw, wr, luma_mode, chroma_mode, ref_frame, mv, bsize, bo, skip, seq.bit_depth);
 
         let cost = wr.tell_frac() - tell;
         let rd = compute_rd_cost(
@@ -251,6 +264,8 @@ pub fn rdo_mode_decision(
           best_rd = rd;
           best_mode_luma = luma_mode;
           best_mode_chroma = chroma_mode;
+          best_ref_frame = ref_frame;
+          best_mv = mv;
           best_skip = skip;
         }
 
@@ -270,6 +285,8 @@ pub fn rdo_mode_decision(
       bo: bo.clone(),
       pred_mode_luma: best_mode_luma,
       pred_mode_chroma: best_mode_chroma,
+      ref_frame: best_ref_frame,
+      mv: best_mv,
       rd_cost: best_rd,
       skip: best_skip
     }]
-- 
GitLab