From 1c122c24a1fdbe140f40d08b0373e4b6324b1210 Mon Sep 17 00:00:00 2001
From: Thomas <thdavies@cisco.com>
Date: Fri, 19 Feb 2016 09:06:12 +0000
Subject: [PATCH] Add quant and dequant functions for new quant matrices.

Change-Id: If0ba62428216fa343b9a37a3b349edba4103c00a
---
 vp10/common/vp10_rtcd_defs.pl |  87 +++++---
 vp10/decoder/decodeframe.c    |  36 +++-
 vp10/decoder/decoder.c        |   4 +
 vp10/decoder/detokenize.c     |  25 ++-
 vp10/encoder/dct.c            |  30 ++-
 vp10/encoder/encodemb.c       | 348 +++++++++++++++++++++++++++-----
 vp10/encoder/encoder.c        |   3 +
 vp10/encoder/quantize.c       | 160 +++++++++++++--
 vpx_dsp/quantize.c            | 366 ++++++++++++++++++++++++++++++++++
 vpx_dsp/quantize.h            |  57 +++++-
 vpx_dsp/vpx_dsp_rtcd_defs.pl  |  38 ++--
 11 files changed, 1034 insertions(+), 120 deletions(-)

diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 9831bdd580..6f8900ae69 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -309,37 +309,67 @@ if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
 
 # ENCODEMB INVOKE
 
-if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
-# the transform coefficients are held in 32-bit
-# values, so the assembler code for  vp10_block_error can no longer be used.
-  add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp10_block_error/;
+if (vpx_config("CONFIG_AOM_QM") eq "yes") {
+  if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
+    # The transform coefficients are held in 32-bit values, so the
+    # assembler code for vp10_block_error can no longer be used.
+    add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+    specialize qw/vp10_block_error/;
+
+    add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+
+    add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+
+    add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+    specialize qw/vp10_fdct8x8_quant/;
+  } else {
+    add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+    specialize qw/vp10_block_error avx2 msa/, "$sse2_x86inc";
 
-  add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_quantize_fp/;
+    add_proto qw/int64_t vp10_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+    specialize qw/vp10_block_error_fp neon/, "$sse2_x86inc";
 
-  add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_quantize_fp_32x32/;
+    add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
 
-  add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_fdct8x8_quant/;
+    add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+
+    add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
+  }
 } else {
-  add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp10_block_error avx2 msa/, "$sse2_x86inc";
+  if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
+    # The transform coefficients are held in 32-bit values, so the
+    # assembler code for vp10_block_error can no longer be used.
+    add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+    specialize qw/vp10_block_error/;
+
+    add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_quantize_fp/;
+
+    add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_quantize_fp_32x32/;
 
-  add_proto qw/int64_t vp10_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
-  specialize qw/vp10_block_error_fp neon/, "$sse2_x86inc";
+    add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_fdct8x8_quant/;
+  } else {
+    add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+    specialize qw/vp10_block_error avx2 msa/, "$sse2_x86inc";
+
+    add_proto qw/int64_t vp10_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+    specialize qw/vp10_block_error_fp neon/, "$sse2_x86inc";
 
-  add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_quantize_fp neon sse2/, "$ssse3_x86_64_x86inc";
+    add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_quantize_fp neon sse2/, "$ssse3_x86_64_x86inc";
 
-  add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
+    add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_fdct8x8_quant sse2 ssse3 neon/;
+  }
 
-  add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_fdct8x8_quant sse2 ssse3 neon/;
 }
 
+
 # fdct functions
 
 if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
@@ -574,11 +604,18 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
   add_proto qw/int64_t vp10_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
   specialize qw/vp10_highbd_block_error sse2/;
 
-  add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_highbd_quantize_fp/;
+  if (vpx_config("CONFIG_AOM_QM") eq "yes") {
+    add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
 
-  add_proto qw/void vp10_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_highbd_quantize_fp_32x32/;
+    add_proto qw/void vp10_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+  } else {
+    add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_highbd_quantize_fp/;
+
+    add_proto qw/void vp10_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vp10_highbd_quantize_fp_32x32/;
+
+  }
 
   # fdct functions
   add_proto qw/void vp10_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 59e0c9be0b..3f7f098fac 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -712,8 +712,8 @@ static void dec_build_inter_predictors_sb(VP10Decoder *const pbi,
   }
 }
 
-static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi, int n4_wl,
-                                         int n4_hl) {
+static INLINE TX_SIZE
+    dec_get_uv_tx_size(const MB_MODE_INFO *mbmi, int n4_wl, int n4_hl) {
   // get minimum log2 num4x4s dimension
   const int x = VPXMIN(n4_wl, n4_hl);
   return VPXMIN(mbmi->tx_size, x);
@@ -1120,8 +1120,13 @@ static void setup_quantization(VP10_COMMON *const cm,
 
 static void setup_segmentation_dequant(VP10_COMMON *const cm) {
   // Build y/uv dequant values based on segmentation.
+  int i = 0;
+#if CONFIG_AOM_QM
+  int lossless;
+  int j = 0;
+  int qmindex;
+#endif
   if (cm->seg.enabled) {
-    int i;
     for (i = 0; i < MAX_SEGMENTS; ++i) {
       const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
       cm->y_dequant[i][0] =
@@ -1131,6 +1136,19 @@ static void setup_segmentation_dequant(VP10_COMMON *const cm) {
           vp10_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth);
       cm->uv_dequant[i][1] =
           vp10_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth);
+#if CONFIG_AOM_QM
+      lossless = qindex == 0 && cm->y_dc_delta_q == 0 &&
+                 cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+      // NB: qmindex uses cm->base_qindex (not the segment qindex), so there
+      // is only one matrix set per frame. No quant weighting when lossless.
+      qmindex = lossless ? QINDEX_RANGE - 1 : cm->base_qindex;
+      for (j = 0; j < TX_SIZES; ++j) {
+        cm->y_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmindex, 0, j, 1);
+        cm->y_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmindex, 0, j, 0);
+        cm->uv_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmindex, 1, j, 1);
+        cm->uv_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmindex, 1, j, 0);
+      }
+#endif
     }
   } else {
     const int qindex = cm->base_qindex;
@@ -1143,6 +1161,18 @@ static void setup_segmentation_dequant(VP10_COMMON *const cm) {
         vp10_dc_quant(qindex, cm->uv_dc_delta_q, cm->bit_depth);
     cm->uv_dequant[0][1] =
         vp10_ac_quant(qindex, cm->uv_ac_delta_q, cm->bit_depth);
+#if CONFIG_AOM_QM
+    lossless = qindex == 0 && cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 &&
+               cm->uv_ac_delta_q == 0;
+    // No quant weighting when lossless
+    qmindex = lossless ? QINDEX_RANGE - 1 : cm->base_qindex;
+    for (j = 0; j < TX_SIZES; ++j) {
+      cm->y_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmindex, 0, j, 1);
+      cm->y_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmindex, 0, j, 0);
+      cm->uv_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmindex, 1, j, 1);
+      cm->uv_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmindex, 1, j, 0);
+    }
+#endif
   }
 }
 
diff --git a/vp10/decoder/decoder.c b/vp10/decoder/decoder.c
index f979abf2f2..438c2da9d4 100644
--- a/vp10/decoder/decoder.c
+++ b/vp10/decoder/decoder.c
@@ -109,6 +109,10 @@ VP10Decoder *vp10_decoder_create(BufferPool *const pool) {
 
   vp10_loop_filter_init(cm);
 
+#if CONFIG_AOM_QM
+  aom_qm_init(cm);
+#endif
+
   cm->error.setjmp = 0;
 
   vpx_get_worker_interface()->init(&pbi->lf_worker);
diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index e57d1efde8..d4da4bb8d4 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c
@@ -43,14 +43,25 @@ static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) {
   return val;
 }
 
+#if CONFIG_AOM_QM
 static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
                         tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
                         int ctx, const int16_t *scan, const int16_t *nb,
-                        vpx_reader *r) {
+                        vpx_reader *r, const qm_val_t *iqm[2][TX_SIZES])
+#else
+static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
+                        tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
+                        int ctx, const int16_t *scan, const int16_t *nb,
+                        vpx_reader *r)
+#endif
+{
   FRAME_COUNTS *counts = xd->counts;
   const int max_eob = 16 << (tx_size << 1);
   const FRAME_CONTEXT *const fc = xd->fc;
   const int ref = is_inter_block(&xd->mi[0]->mbmi);
+#if CONFIG_AOM_QM
+  const qm_val_t *iqmatrix = iqm[!ref][tx_size];
+#endif
   int band, c = 0;
   const vpx_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       fc->coef_probs[tx_size][type][ref];
@@ -183,6 +194,10 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
         }
       }
     }
+#if CONFIG_AOM_QM
+    dqv = ((iqmatrix[scan[c]] * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif
     v = (val * dqv) >> dq_shift;
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
 #if CONFIG_VPX_HIGHBITDEPTH
@@ -249,8 +264,16 @@ int vp10_decode_block_tokens(MACROBLOCKD *xd, int plane, const scan_order *sc,
   const int16_t *const dequant = pd->seg_dequant[seg_id];
   const int ctx =
       get_entropy_context(tx_size, pd->above_context + x, pd->left_context + y);
+#if CONFIG_AOM_QM
+  const int eob =
+      decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, dequant, ctx,
+                   sc->scan, sc->neighbors, r, pd->seg_iqmatrix[seg_id]);
+#else
   const int eob = decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size,
                                dequant, ctx, sc->scan, sc->neighbors, r);
+#endif
   dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
   return eob;
 }
+
+
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 2a7ba7ef4e..300a742518 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -1021,7 +1021,12 @@ void vp10_fdct8x8_quant_c(const int16_t *input, int stride,
                           const int16_t *quant_shift_ptr,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                          const int16_t *scan, const int16_t *iscan) {
+                          const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+                          ,
+                          const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+                          ) {
   int eob = -1;
 
   int i, j;
@@ -1107,16 +1112,29 @@ void vp10_fdct8x8_quant_c(const int16_t *input, int stride,
     for (i = 0; i < n_coeffs; i++) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      int tmp32;
+#if CONFIG_AOM_QM
+      tmp32 = (tmp * quant_ptr[rc != 0] * wt) >> (16 + AOM_QM_BITS);
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+#else
+      tmp32 = (tmp * quant_ptr[rc != 0]) >> 16;
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+#endif
 
-      if (tmp) eob = i;
+      if (tmp32) eob = i;
     }
   }
   *eob_ptr = eob + 1;
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index b2fbf13a9d..afd38a3272 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -99,6 +99,11 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
   const PLANE_TYPE type = pd->plane_type;
   const int default_eob = 16 << (tx_size << 1);
   const int mul = 1 + (tx_size == TX_32X32);
+#if CONFIG_AOM_QM
+  int seg_id = xd->mi[0]->mbmi.segment_id;
+  int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
+  const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
+#endif
   const int16_t *dequant_ptr = pd->dequant;
   const uint8_t *const band_translate = get_band_translate(tx_size);
   TX_TYPE tx_type = get_tx_type(type, xd, block);
@@ -138,7 +143,11 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
 
   for (i = eob; i-- > 0;) {
     int base_bits, d2, dx;
+
     const int rc = scan[i];
+#if CONFIG_AOM_QM
+    int iwt = iqmatrix[rc];
+#endif
     int x = qcoeff[rc];
     /* Only add a trellis state for non-zero coefficients. */
     if (x) {
@@ -182,9 +191,16 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
 
+#if CONFIG_AOM_QM
+      if ((abs(x) * dequant_ptr[rc != 0] * iwt >
+           ((abs(coeff[rc]) * mul) << AOM_QM_BITS)) &&
+          (abs(x) * dequant_ptr[rc != 0] * iwt <
+           ((abs(coeff[rc]) * mul + dequant_ptr[rc != 0]) << AOM_QM_BITS)))
+#else
       if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
           (abs(x) * dequant_ptr[rc != 0] <
            abs(coeff[rc]) * mul + dequant_ptr[rc != 0]))
+#endif
         shortcut = 1;
       else
         shortcut = 0;
@@ -239,6 +255,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
 #endif  // CONFIG_VPX_HIGHBITDEPTH
         d2 = dx * dx;
       }
+
       tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
       tokens[i][1].error = d2 + (best ? error1 : error0);
       tokens[i][1].next = next;
@@ -288,12 +305,21 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
   for (i = next; i < eob; i = next) {
     const int x = tokens[i][best].qc;
     const int rc = scan[i];
+#if CONFIG_AOM_QM
+    const int iwt = iqmatrix[rc];
+    const int dequant =
+        (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+#endif
     if (x) {
       final_eob = i;
     }
 
     qcoeff[rc] = x;
+#if CONFIG_AOM_QM
+    dqcoeff[rc] = (x * dequant) / mul;
+#else
     dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
+#endif
 
     next = tokens[i][best].next;
     best = best_index[i][best];
@@ -328,11 +354,17 @@ void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
     vp10_fwht4x4(src_diff, coeff, diff_stride);
   } else {
     switch (tx_type) {
-      case DCT_DCT: vpx_fdct4x4(src_diff, coeff, diff_stride); break;
+      case DCT_DCT:
+        vpx_fdct4x4(src_diff, coeff, diff_stride);
+        break;
       case ADST_DCT:
       case DCT_ADST:
-      case ADST_ADST: vp10_fht4x4(src_diff, coeff, diff_stride, tx_type); break;
-      default: assert(0); break;
+      case ADST_ADST:
+        vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
+        break;
+      default:
+        assert(0);
+        break;
     }
   }
 }
@@ -343,8 +375,12 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
     case DCT_DCT:
     case ADST_DCT:
     case DCT_ADST:
-    case ADST_ADST: vp10_fht8x8(src_diff, coeff, diff_stride, tx_type); break;
-    default: assert(0); break;
+    case ADST_ADST:
+      vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
+      break;
+    default:
+      assert(0);
+      break;
   }
 }
 
@@ -354,8 +390,12 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
     case DCT_DCT:
     case ADST_DCT:
     case DCT_ADST:
-    case ADST_ADST: vp10_fht16x16(src_diff, coeff, diff_stride, tx_type); break;
-    default: assert(0); break;
+    case ADST_ADST:
+      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
+      break;
+    default:
+      assert(0);
+      break;
   }
 }
 
@@ -363,11 +403,17 @@ static void fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
                            tran_low_t *coeff, int diff_stride,
                            TX_TYPE tx_type) {
   switch (tx_type) {
-    case DCT_DCT: fdct32x32(rd_transform, src_diff, coeff, diff_stride); break;
+    case DCT_DCT:
+      fdct32x32(rd_transform, src_diff, coeff, diff_stride);
+      break;
     case ADST_DCT:
     case DCT_ADST:
-    case ADST_ADST: assert(0); break;
-    default: assert(0); break;
+    case ADST_ADST:
+      assert(0);
+      break;
+    default:
+      assert(0);
+      break;
   }
 }
 
@@ -379,13 +425,17 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
     vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
   } else {
     switch (tx_type) {
-      case DCT_DCT: vpx_highbd_fdct4x4(src_diff, coeff, diff_stride); break;
+      case DCT_DCT:
+        vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
+        break;
       case ADST_DCT:
       case DCT_ADST:
       case ADST_ADST:
         vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
         break;
-      default: assert(0); break;
+      default:
+        assert(0);
+        break;
     }
   }
 }
@@ -393,26 +443,34 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
 static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TX_TYPE tx_type) {
   switch (tx_type) {
-    case DCT_DCT: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); break;
+    case DCT_DCT:
+      vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+      break;
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
       vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
       break;
-    default: assert(0); break;
+    default:
+      assert(0);
+      break;
   }
 }
 
 static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
                                   int diff_stride, TX_TYPE tx_type) {
   switch (tx_type) {
-    case DCT_DCT: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); break;
+    case DCT_DCT:
+      vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+      break;
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
       vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
       break;
-    default: assert(0); break;
+    default:
+      assert(0);
+      break;
   }
 }
 
@@ -425,8 +483,12 @@ static void highbd_fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
       break;
     case ADST_DCT:
     case DCT_ADST:
-    case ADST_ADST: assert(0); break;
-    default: assert(0); break;
+    case ADST_ADST:
+      assert(0);
+      break;
+    default:
+      assert(0);
+      break;
   }
 }
 #endif  // CONFIG_VPX_HIGHBITDEPTH
@@ -444,6 +506,12 @@ void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block, int blk_row,
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+#if CONFIG_AOM_QM
+  int seg_id = xd->mi[0]->mbmi.segment_id;
+  int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
+  const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][tx_size];
+  const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
+#endif
   const int16_t *src_diff;
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
@@ -455,21 +523,34 @@ void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block, int blk_row,
         vp10_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
                                       p->round_fp, p->quant_fp, p->quant_shift,
                                       qcoeff, dqcoeff, pd->dequant, eob,
-                                      scan_order->scan, scan_order->iscan);
+                                      scan_order->scan,
+#if !CONFIG_AOM_QM
+                                      scan_order->iscan);
+#else
+                                      scan_order->iscan, qmatrix, iqmatrix);
+#endif
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
         vp10_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                                 p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                                 scan_order->iscan);
+#else
+                                scan_order->iscan, qmatrix, iqmatrix);
+#endif
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
         vp10_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
                                 p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                                 scan_order->iscan);
+#else
+                                scan_order->iscan, qmatrix, iqmatrix);
+#endif
         break;
       case TX_4X4:
         if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
@@ -480,9 +561,14 @@ void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block, int blk_row,
         vp10_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
                                 p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                                 scan_order->iscan);
+#else
+                                scan_order->iscan, qmatrix, iqmatrix);
+#endif
         break;
-      default: assert(0);
+      default:
+        assert(0);
     }
     return;
   }
@@ -494,19 +580,32 @@ void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block, int blk_row,
       vp10_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
                              p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                              pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                              scan_order->iscan);
+#else
+                             scan_order->iscan, qmatrix, iqmatrix);
+#endif
       break;
     case TX_16X16:
       vpx_fdct16x16(src_diff, coeff, diff_stride);
       vp10_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                        p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, eob, scan_order->scan, scan_order->iscan);
+                       pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
+                       scan_order->iscan);
+#else
+                       scan_order->iscan, qmatrix, iqmatrix);
+#endif
       break;
     case TX_8X8:
       vp10_fdct8x8_quant(src_diff, diff_stride, coeff, 64, x->skip_block,
                          p->zbin, p->round_fp, p->quant_fp, p->quant_shift,
                          qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                          scan_order->iscan);
+#else
+                         scan_order->iscan, qmatrix, iqmatrix);
+#endif
       break;
     case TX_4X4:
       if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
@@ -516,9 +615,16 @@ void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block, int blk_row,
       }
       vp10_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
                        p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, eob, scan_order->scan, scan_order->iscan);
+                       pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
+                       scan_order->iscan);
+#else
+                       scan_order->iscan, qmatrix, iqmatrix);
+#endif
+      break;
+    default:
+      assert(0);
       break;
-    default: assert(0); break;
   }
 }
 
@@ -532,6 +638,12 @@ void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block, int blk_row,
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  int seg_id = xd->mi[0]->mbmi.segment_id;
+#if CONFIG_AOM_QM
+  int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
+  const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][tx_size];
+  const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
+#endif
   const int16_t *src_diff;
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
@@ -542,31 +654,49 @@ void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block, int blk_row,
         vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
         vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
                                      p->quant_fp[0], qcoeff, dqcoeff,
-                                     pd->dequant[0], eob);
+                                     pd->dequant[0],
+#if !CONFIG_AOM_QM
+                                     eob);
+#else
+                                     eob, qmatrix, iqmatrix);
+#endif
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
         vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
                                p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
+#if !CONFIG_AOM_QM
                                eob);
+#else
+                               eob, qmatrix, iqmatrix);
+#endif
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
         vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
                                p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
+#if !CONFIG_AOM_QM
                                eob);
+#else
+                               eob, qmatrix, iqmatrix);
+#endif
         break;
       case TX_4X4:
-        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+        if (xd->lossless[seg_id]) {
           vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
         } else {
           vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
         }
         vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
                                p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
+#if !CONFIG_AOM_QM
                                eob);
+#else
+                               eob, qmatrix, iqmatrix);
+#endif
         break;
-      default: assert(0);
+      default:
+        assert(0);
     }
     return;
   }
@@ -576,28 +706,50 @@ void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block, int blk_row,
     case TX_32X32:
       vpx_fdct32x32_1(src_diff, coeff, diff_stride);
       vpx_quantize_dc_32x32(coeff, x->skip_block, p->round, p->quant_fp[0],
-                            qcoeff, dqcoeff, pd->dequant[0], eob);
+                            qcoeff, dqcoeff, pd->dequant[0],
+#if !CONFIG_AOM_QM
+                            eob);
+#else
+                            eob, qmatrix, iqmatrix);
+#endif
       break;
     case TX_16X16:
       vpx_fdct16x16_1(src_diff, coeff, diff_stride);
       vpx_quantize_dc(coeff, 256, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+                      qcoeff, dqcoeff, pd->dequant[0],
+#if !CONFIG_AOM_QM
+                      eob);
+#else
+                      eob, qmatrix, iqmatrix);
+#endif
       break;
     case TX_8X8:
       vpx_fdct8x8_1(src_diff, coeff, diff_stride);
       vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+                      qcoeff, dqcoeff, pd->dequant[0],
+#if !CONFIG_AOM_QM
+                      eob);
+#else
+                      eob, qmatrix, iqmatrix);
+#endif
       break;
     case TX_4X4:
-      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+      if (xd->lossless[seg_id]) {
         vp10_fwht4x4(src_diff, coeff, diff_stride);
       } else {
         vpx_fdct4x4(src_diff, coeff, diff_stride);
       }
       vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+                      qcoeff, dqcoeff, pd->dequant[0],
+#if !CONFIG_AOM_QM
+                      eob);
+#else
+                      eob, qmatrix, iqmatrix);
+#endif
+      break;
+    default:
+      assert(0);
       break;
-    default: assert(0); break;
   }
 }
 
@@ -614,6 +766,12 @@ void vp10_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  int seg_id = xd->mi[0]->mbmi.segment_id;
+#if CONFIG_AOM_QM
+  int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
+  const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][tx_size];
+  const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
+#endif
   const int16_t *src_diff;
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
@@ -626,31 +784,48 @@ void vp10_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
         vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
                                     p->round, p->quant, p->quant_shift, qcoeff,
                                     dqcoeff, pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                                     scan_order->iscan);
+#else
+                                    scan_order->iscan, qmatrix, iqmatrix);
+#endif
         break;
       case TX_16X16:
         highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
         vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                               p->quant, p->quant_shift, qcoeff, dqcoeff,
                               pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                               scan_order->iscan);
+#else
+                              scan_order->iscan, qmatrix, iqmatrix);
+#endif
         break;
       case TX_8X8:
         highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
         vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                               p->quant, p->quant_shift, qcoeff, dqcoeff,
                               pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                               scan_order->iscan);
+#else
+                              scan_order->iscan, qmatrix, iqmatrix);
+#endif
         break;
       case TX_4X4:
         vp10_highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                                 xd->lossless[xd->mi[0]->mbmi.segment_id]);
+                                 xd->lossless[seg_id]);
         vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                               p->quant, p->quant_shift, qcoeff, dqcoeff,
                               pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                               scan_order->iscan);
+#else
+                              scan_order->iscan, qmatrix, iqmatrix);
+#endif
         break;
-      default: assert(0);
+      default:
+        assert(0);
     }
     return;
   }
@@ -662,28 +837,49 @@ void vp10_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
       vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
                            pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                            scan_order->iscan);
+#else
+                           scan_order->iscan, qmatrix, iqmatrix);
+#endif
       break;
     case TX_16X16:
       fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
       vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
                      p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+                     scan_order->scan,
+#if !CONFIG_AOM_QM
+                     scan_order->iscan);
+#else
+                     scan_order->iscan, qmatrix, iqmatrix);
+#endif
       break;
     case TX_8X8:
       fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
       vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
                      p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+                     scan_order->scan,
+#if !CONFIG_AOM_QM
+                     scan_order->iscan);
+#else
+                     scan_order->iscan, qmatrix, iqmatrix);
+#endif
       break;
     case TX_4X4:
       vp10_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                        xd->lossless[xd->mi[0]->mbmi.segment_id]);
+                        xd->lossless[seg_id]);
       vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
                      p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+                     scan_order->scan,
+#if !CONFIG_AOM_QM
+                     scan_order->iscan);
+#else
+                     scan_order->iscan, qmatrix, iqmatrix);
+#endif
+      break;
+    default:
+      assert(0);
       break;
-    default: assert(0); break;
   }
 }
 
@@ -780,7 +976,9 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
                                      p->eobs[block], xd->bd, tx_type,
                                      xd->lossless[xd->mi[0]->mbmi.segment_id]);
         break;
-      default: assert(0 && "Invalid transform size"); break;
+      default:
+        assert(0 && "Invalid transform size");
+        break;
     }
 
     return;
@@ -807,7 +1005,9 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
       vp10_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, p->eobs[block],
                             tx_type, xd->lossless[xd->mi[0]->mbmi.segment_id]);
       break;
-    default: assert(0 && "Invalid transform size"); break;
+    default:
+      assert(0 && "Invalid transform size");
+      break;
   }
 }
 
@@ -899,6 +1099,12 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
   uint8_t *src, *dst;
   int16_t *src_diff;
   uint16_t *eob = &p->eobs[block];
+  int seg_id = xd->mi[0]->mbmi.segment_id;
+#if CONFIG_AOM_QM
+  int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
+  const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][tx_size];
+  const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
+#endif
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
   dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
@@ -921,7 +1127,12 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
           vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
                                       p->round, p->quant, p->quant_shift,
                                       qcoeff, dqcoeff, pd->dequant, eob,
-                                      scan_order->scan, scan_order->iscan);
+                                      scan_order->scan,
+#if !CONFIG_AOM_QM
+                                      scan_order->iscan);
+#else
+                                      scan_order->iscan, qmatrix, iqmatrix);
+#endif
         }
         if (*eob)
           vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, xd->bd,
@@ -935,7 +1146,11 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
           vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                                 scan_order->iscan);
+#else
+                                scan_order->iscan, qmatrix, iqmatrix);
+#endif
         }
         if (*eob)
           vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, xd->bd,
@@ -949,7 +1164,11 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
           vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                                 scan_order->iscan);
+#else
+                                scan_order->iscan, qmatrix, iqmatrix);
+#endif
         }
         if (*eob)
           vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, xd->bd,
@@ -960,11 +1179,15 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
           vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src,
                                     src_stride, dst, dst_stride, xd->bd);
           vp10_highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                                   xd->lossless[mbmi->segment_id]);
+                                   xd->lossless[seg_id]);
           vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                                 p->quant, p->quant_shift, qcoeff, dqcoeff,
                                 pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                                 scan_order->iscan);
+#else
+                                scan_order->iscan, qmatrix, iqmatrix);
+#endif
         }
 
         if (*eob)
@@ -972,9 +1195,11 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
           // eob<=1 which is significant (not just an optimization) for the
           // lossless case.
           vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, dst_stride, *eob, xd->bd,
-                                       tx_type, xd->lossless[mbmi->segment_id]);
+                                       tx_type, xd->lossless[seg_id]);
         break;
-      default: assert(0); return;
+      default:
+        assert(0);
+        return;
     }
     if (*eob) *(args->skip) = 0;
     return;
@@ -991,7 +1216,11 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
         vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                              p->quant, p->quant_shift, qcoeff, dqcoeff,
                              pd->dequant, eob, scan_order->scan,
+#if !CONFIG_AOM_QM
                              scan_order->iscan);
+#else
+                             scan_order->iscan, qmatrix, iqmatrix);
+#endif
       }
       if (*eob)
         vp10_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, tx_type);
@@ -1003,7 +1232,12 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
         fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
         vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
                        p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+                       scan_order->scan,
+#if !CONFIG_AOM_QM
+                       scan_order->iscan);
+#else
+                       scan_order->iscan, qmatrix, iqmatrix);
+#endif
       }
       if (*eob)
         vp10_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, tx_type);
@@ -1015,7 +1249,12 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
         fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
         vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
                        p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+                       scan_order->scan,
+#if !CONFIG_AOM_QM
+                       scan_order->iscan);
+#else
+                       scan_order->iscan, qmatrix, iqmatrix);
+#endif
       }
       if (*eob) vp10_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, tx_type);
       break;
@@ -1024,10 +1263,15 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
         vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst,
                            dst_stride);
         vp10_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                          xd->lossless[mbmi->segment_id]);
+                          xd->lossless[seg_id]);
         vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
                        p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+                       scan_order->scan,
+#if !CONFIG_AOM_QM
+                       scan_order->iscan);
+#else
+                       scan_order->iscan, qmatrix, iqmatrix);
+#endif
       }
 
       if (*eob) {
@@ -1035,10 +1279,12 @@ void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
         // which is significant (not just an optimization) for the lossless
         // case.
         vp10_inv_txfm_add_4x4(dqcoeff, dst, dst_stride, *eob, tx_type,
-                              xd->lossless[mbmi->segment_id]);
+                              xd->lossless[seg_id]);
       }
       break;
-    default: assert(0); break;
+    default:
+      assert(0);
+      break;
   }
   if (*eob) *(args->skip) = 0;
 }
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 4231d4bcf6..b7a18cc765 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -1613,6 +1613,9 @@ VP10_COMP *vp10_create_compressor(VP10EncoderConfig *oxcf,
    * vp10_init_quantizer() for every frame.
    */
   vp10_init_quantizer(cpi);
+#if CONFIG_AOM_QM
+  aom_qm_init(cm);
+#endif
 
   vp10_loop_filter_init(cm);
 
diff --git a/vp10/encoder/quantize.c b/vp10/encoder/quantize.c
index 0688a69ca7..820dc4a020 100644
--- a/vp10/encoder/quantize.c
+++ b/vp10/encoder/quantize.c
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <math.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -26,7 +25,12 @@ void vp10_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan,
-                        const int16_t *iscan) {
+                        const int16_t *iscan
+#if CONFIG_AOM_QM
+                        ,
+                        const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+                        ) {
   int i, eob = -1;
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
@@ -43,16 +47,29 @@ void vp10_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     for (i = 0; i < n_coeffs; i++) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+      int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+      int tmp32;
+#if CONFIG_AOM_QM
+      tmp32 = (tmp * wt * quant_ptr[rc != 0]) >> (16 + AOM_QM_BITS);
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+#else
+      tmp32 = (tmp * quant_ptr[rc != 0]) >> 16;
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+#endif
 
-      if (tmp) eob = i;
+      if (tmp32) eob = i;
     }
   }
   *eob_ptr = eob + 1;
@@ -66,7 +83,12 @@ void vp10_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                               const int16_t *scan, const int16_t *iscan) {
+                               const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+                               ,
+                               const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+                               ) {
   int i;
   int eob = -1;
   // TODO(jingning) Decide the need of these arguments after the
@@ -84,12 +106,26 @@ void vp10_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
     for (i = 0; i < count; i++) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       const int64_t tmp = abs_coeff + round_ptr[rc != 0];
+#if CONFIG_AOM_QM
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp * quant_ptr[rc != 0] * wt) >> (16 + AOM_QM_BITS));
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+#else
       const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
       qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+#endif
       if (abs_qcoeff) eob = i;
     }
   }
@@ -106,7 +142,12 @@ void vp10_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                              const int16_t *scan, const int16_t *iscan) {
+                              const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+                              ,
+                              const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+                              ) {
   int i, eob = -1;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
@@ -119,19 +160,38 @@ void vp10_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     for (i = 0; i < n_coeffs; i++) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      int64_t tmp = 0;
+#endif
       const int coeff_sign = (coeff >> 31);
-      int tmp = 0;
+      int tmp32 = 0;
       int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
+#if CONFIG_AOM_QM
+      if (abs_coeff * wt >= (dequant_ptr[rc != 0] << (AOM_QM_BITS - 2))) {
+#else
       if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+#endif
         abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
         abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
-        tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
-        qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+#if CONFIG_AOM_QM
+        tmp = abs_coeff * wt;  // tmp is 64-bit; narrow only after the shift
+        tmp32 = (int)((tmp * quant_ptr[rc != 0]) >> (AOM_QM_BITS + 15));
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
+#else
+        tmp32 = (abs_coeff * quant_ptr[rc != 0]) >> 15;
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant_ptr[rc != 0]) / 2;
+#endif
       }
 
-      if (tmp) eob = i;
+      if (tmp32) eob = i;
     }
   }
   *eob_ptr = eob + 1;
@@ -143,7 +203,12 @@ void vp10_highbd_quantize_fp_32x32_c(
     const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
+    const int16_t *scan, const int16_t *iscan
+#if CONFIG_AOM_QM
+    ,
+    const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+    ) {
   int i, eob = -1;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
@@ -157,15 +222,32 @@ void vp10_highbd_quantize_fp_32x32_c(
       uint32_t abs_qcoeff = 0;
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
+#if CONFIG_AOM_QM
+      const qm_val_t wt = qm_ptr[rc];
+      const qm_val_t iwt = iqm_ptr[rc];
+      const int dequant =
+          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+#endif
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
+#if CONFIG_AOM_QM
+      if (abs_coeff * wt >= (dequant_ptr[rc != 0] << (AOM_QM_BITS - 2))) {
+#else
       if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+#endif
         const int64_t tmp =
             abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+#if CONFIG_AOM_QM
+        abs_qcoeff =
+            (uint32_t)((tmp * wt * quant_ptr[rc != 0]) >> (AOM_QM_BITS + 15));
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
+#else
         abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 15);
         qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+        dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant_ptr[rc != 0]) / 2;
+#endif
       }
 
       if (abs_qcoeff) eob = i;
@@ -180,6 +262,12 @@ void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
+#if CONFIG_AOM_QM
+  int seg_id = xd->mi[0]->mbmi.segment_id;
+  int is_intra = !is_inter_block(&xd->mi[0]->mbmi);  // intra == not inter
+  const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][0];
+  const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][0];
+#endif
 
 #if CONFIG_VPX_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -187,7 +275,12 @@ void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                           p->zbin, p->round, p->quant, p->quant_shift,
                           BLOCK_OFFSET(p->qcoeff, block),
                           BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant,
+#if !CONFIG_AOM_QM
                           &p->eobs[block], scan, iscan);
+#else
+                          &p->eobs[block], scan, iscan,
+                          qmatrix, iqmatrix);
+#endif
     return;
   }
 #endif
@@ -195,7 +288,12 @@ void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                  p->round, p->quant, p->quant_shift,
                  BLOCK_OFFSET(p->qcoeff, block),
                  BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, &p->eobs[block],
-                 scan, iscan);
+#if !CONFIG_AOM_QM
+                 scan, iscan);
+#else
+                 scan, iscan,
+                 qmatrix, iqmatrix);
+#endif
 }
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -212,9 +310,12 @@ static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
   const int quant = vp10_dc_quant(q, 0, bit_depth);
 #if CONFIG_VPX_HIGHBITDEPTH
   switch (bit_depth) {
-    case VPX_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
-    case VPX_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80);
-    case VPX_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
+    case VPX_BITS_8:
+      return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+    case VPX_BITS_10:
+      return q == 0 ? 64 : (quant < 592 ? 84 : 80);
+    case VPX_BITS_12:
+      return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
     default:
       assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
       return -1;
@@ -288,6 +389,11 @@ void vp10_init_plane_quantizers(VP10_COMP *cpi, MACROBLOCK *x) {
   const int qindex = vp10_get_qindex(&cm->seg, segment_id, cm->base_qindex);
   const int rdmult = vp10_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
   int i;
+#if CONFIG_AOM_QM
+  const int lossless = xd->lossless[segment_id];
+  // Quant matrix only depends on the base QP so there is only one set per frame
+  int qmlevel = lossless ? NUM_QM_LEVELS - 1 : aom_get_qmlevel(cm->base_qindex);
+#endif
 
   // Y
   x->plane[0].quant = quants->y_quant[qindex];
@@ -296,6 +402,12 @@ void vp10_init_plane_quantizers(VP10_COMP *cpi, MACROBLOCK *x) {
   x->plane[0].quant_shift = quants->y_quant_shift[qindex];
   x->plane[0].zbin = quants->y_zbin[qindex];
   x->plane[0].round = quants->y_round[qindex];
+#if CONFIG_AOM_QM
+  memcpy(&xd->plane[0].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][0],
+         sizeof(cm->gqmatrix[qmlevel][0]));
+  memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0],
+         sizeof(cm->giqmatrix[qmlevel][0]));
+#endif
   xd->plane[0].dequant = cpi->y_dequant[qindex];
 
   x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
@@ -309,6 +421,12 @@ void vp10_init_plane_quantizers(VP10_COMP *cpi, MACROBLOCK *x) {
     x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
     x->plane[i].zbin = quants->uv_zbin[qindex];
     x->plane[i].round = quants->uv_round[qindex];
+#if CONFIG_AOM_QM
+    memcpy(&xd->plane[i].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1],
+           sizeof(cm->gqmatrix[qmlevel][1]));
+    memcpy(&xd->plane[i].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1],
+           sizeof(cm->giqmatrix[qmlevel][1]));
+#endif
     xd->plane[i].dequant = cpi->uv_dequant[qindex];
 
     x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index e65f4f3cdd..096bc32ff5 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -11,6 +11,371 @@
 #include "vpx_dsp/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 
+#if CONFIG_AOM_QM
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr,
+                     const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  const int rc = 0;  // only the coefficient at index 0 is quantized
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic >>
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+  int64_t tmp, eob = -1;
+  int32_t tmp32;
+  int dequant =  // dequant_ptr weighted by iqm_ptr[rc], rounded to nearest
+      (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+  // DC-only quantizer with quant-matrix weights; all outputs are zeroed first.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+    tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (16 + AOM_QM_BITS));
+    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;  // reapply the sign
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+    if (tmp32) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 when a non-zero level was produced, otherwise 0
+}
+
+#if CONFIG_VPX_HIGHBITDEPTH
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+                            int skip_block, const int16_t *round_ptr,
+                            const int16_t quant, tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+                            uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+                            const qm_val_t *iqm_ptr) {
+  int eob = -1;
+  int dequant =  // dequant_ptr weighted by iqm_ptr[0], rounded to nearest
+      (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+  // DC-only quantizer with quant-matrix weights; all outputs are zeroed first.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic >>
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+    const int64_t tmp = abs_coeff + round_ptr[0];  // 64-bit, no int16 clamp
+    const uint32_t abs_qcoeff =
+        (uint32_t)((tmp * qm_ptr[0] * quant) >> (16 + AOM_QM_BITS));
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
+    if (abs_qcoeff) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 when a non-zero level was produced, otherwise 0
+}
+#endif
+
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr,
+                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  const int n_coeffs = 1024;  // 32 * 32 entries are cleared in each output
+  const int rc = 0;  // only the coefficient at index 0 is quantized
+  const int coeff = coeff_ptr[rc];
+  const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic >>
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+  int64_t tmp, eob = -1;
+  int32_t tmp32;
+  int dequant;
+  // DC-only quantizer for 32x32 blocks with quant-matrix weights.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                INT16_MIN, INT16_MAX);
+    tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (15 + AOM_QM_BITS));
+    qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;  // reapply the sign
+    dequant =  // dequant_ptr weighted by iqm_ptr[rc], rounded to nearest
+        (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;  // halved for 32x32
+    if (tmp32) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 when a non-zero level was produced, otherwise 0
+}
+
+#if CONFIG_VPX_HIGHBITDEPTH
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                                  const int16_t *round_ptr, const int16_t quant,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  const int16_t dequant_ptr, uint16_t *eob_ptr,
+                                  const qm_val_t *qm_ptr,
+                                  const qm_val_t *iqm_ptr) {
+  const int n_coeffs = 1024;  // 32 * 32 entries are cleared in each output
+  int eob = -1;
+  int dequant;
+  // DC-only quantizer for 32x32 blocks with quant-matrix weights.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic >>
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
+    const uint32_t abs_qcoeff =
+        (uint32_t)((tmp * qm_ptr[0] * quant) >> (15 + AOM_QM_BITS));
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dequant =  // dequant_ptr weighted by iqm_ptr[0], rounded to nearest
+        (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+    dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 2;  // halved for 32x32
+    if (abs_qcoeff) eob = 0;
+  }
+  *eob_ptr = eob + 1;  // 1 when a non-zero level was produced, otherwise 0
+}
+#endif
+
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block, const int16_t *zbin_ptr,
+                      const int16_t *round_ptr, const int16_t *quant_ptr,
+                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr, const int16_t *scan,
+                      const int16_t *iscan, const qm_val_t *qm_ptr,
+                      const qm_val_t *iqm_ptr) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };  // [0]: rc == 0, [1]: rest
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  (void)iscan;
+  // Quantizes with per-position weights qm_ptr[]; outputs are zeroed first.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: trim the scan-order tail that lies inside the zero bin.
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+      // Scale by multiplying: left-shifting the negative bound is undefined.
+      if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
+          coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: All coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic >>
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+      int dequant;
+
+      if (abs_coeff * wt >= (zbins[rc != 0] * (1 << AOM_QM_BITS))) {
+        int32_t tmp32;
+        int64_t tmp =
+            clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+        tmp = tmp * wt;
+        tmp32 = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+                 quant_shift_ptr[rc != 0]) >>
+                (16 + AOM_QM_BITS);  // quantization
+        dequant =  // step weighted by iqm_ptr[rc], rounded to nearest
+            (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+            AOM_QM_BITS;
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+
+        if (tmp32) eob = i;
+      }
+    }
+  }
+  *eob_ptr = eob + 1;  // 1 + last scan index with a non-zero level
+}
+
+#if CONFIG_VPX_HIGHBITDEPTH
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan,
+                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };  // [0]: rc == 0, [1]: rest
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+  int dequant;
+  (void)iscan;
+  // High-bitdepth quantizer with qm_ptr[] weights; outputs are zeroed first.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: trim the scan-order tail that lies inside the zero bin.
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+      // Scale by multiplying: left-shifting the negative bound is undefined.
+      if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
+          coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: All coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic >>
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+
+      if (abs_coeff * wt >= (zbins[rc != 0] * (1 << AOM_QM_BITS))) {
+        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];  // no int16 clamp
+        const int64_t tmpw = tmp1 * wt;
+        const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 + AOM_QM_BITS));
+        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+        dequant =  // step weighted by iqm_ptr[rc], rounded to nearest
+            (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+            AOM_QM_BITS;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+        if (abs_qcoeff) eob = i;
+      }
+    }
+  }
+  *eob_ptr = eob + 1;  // 1 + last scan index with a non-zero level
+}
+#endif
+
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t *zbin_ptr,
+                            const int16_t *round_ptr, const int16_t *quant_ptr,
+                            const int16_t *quant_shift_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan,
+                            const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;
+  int idx_arr[1024];  // scan indices kept by the pre-scan; n_coeffs <= 1024
+  int i, eob = -1;
+  int dequant;
+  (void)iscan;
+  // 32x32 quantizer with qm_ptr[] weights; outputs are zeroed before use.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: record every scan index that lies outside the zero bin.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+      // Scale by multiplying: left-shifting the negative bound is undefined.
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
+          coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic >>
+      const qm_val_t wt = qm_ptr[rc];
+      int64_t tmp;
+      int tmp32;
+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+      tmp = tmp * wt;
+      tmp32 = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+               quant_shift_ptr[rc != 0]) >>
+              (15 + AOM_QM_BITS);
+
+      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+      dequant =  // step weighted by iqm_ptr[rc], rounded to nearest
+          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;  // halved for 32x32
+
+      if (tmp32) eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;  // 1 + last scan index with a non-zero level
+}
+
+#if CONFIG_VPX_HIGHBITDEPTH
+void vpx_highbd_quantize_b_32x32_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr) {
+  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
+  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+
+  int idx = 0;
+  int idx_arr[1024];  // scan indices kept by the pre-scan; n_coeffs <= 1024
+  int i, eob = -1;
+  int dequant;
+  (void)iscan;
+  // High-bitdepth 32x32 quantizer with qm_ptr[] weights; outputs zeroed first.
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    // Pre-scan pass: record every scan index that lies outside the zero bin.
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      const qm_val_t wt = qm_ptr[rc];
+      const int coeff = coeff_ptr[rc] * wt;
+      // Scale by multiplying: left-shifting the negative bound is undefined.
+      // If the coefficient is out of the base ZBIN range, keep it for
+      // quantization.
+      if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
+          coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+        idx_arr[idx++] = i;
+    }
+
+    // Quantization pass: only process the coefficients selected in
+    // pre-scan pass. Note: idx can be zero.
+    for (i = 0; i < idx; i++) {
+      const int rc = scan[idx_arr[i]];
+      const int coeff = coeff_ptr[rc];
+      const int coeff_sign = (coeff >> 31);  // sign mask; assumes arithmetic >>
+      const qm_val_t wt = qm_ptr[rc];
+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // |coeff|
+      const int64_t tmp1 =
+          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+      const int64_t tmpw = tmp1 * wt;
+      const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (15 + AOM_QM_BITS));
+      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+      dequant =  // step weighted by iqm_ptr[rc], rounded to nearest
+          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
+          AOM_QM_BITS;
+      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;  // halved for 32x32
+      if (abs_qcoeff) eob = idx_arr[i];
+    }
+  }
+  *eob_ptr = eob + 1;  // 1 + last scan index with a non-zero level
+}
+#endif
+#else
 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                      const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -317,3 +682,4 @@ void vpx_highbd_quantize_b_32x32_c(
   *eob_ptr = eob + 1;
 }
 #endif
+#endif
diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h
index 75ab9f28bb..229d4dce10 100644
--- a/vpx_dsp/quantize.h
+++ b/vpx_dsp/quantize.h
@@ -18,6 +18,47 @@
 extern "C" {
 #endif
 
+#if CONFIG_AOM_QM
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+                     const int16_t *round_ptr, const int16_t quant_ptr,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                     const int16_t dequant_ptr, uint16_t *eob_ptr,
+                     const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                           const int16_t dequant_ptr, uint16_t *eob_ptr,
+                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block, const int16_t *zbin_ptr,
+                      const int16_t *round_ptr, const int16_t *quant_ptr,
+                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr, const int16_t *scan,
+                      const int16_t *iscan, const qm_val_t *qm_ptr,
+                      const qm_val_t *iqm_ptr);
+#if CONFIG_VPX_HIGHBITDEPTH
+void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+                            int skip_block, const int16_t *round_ptr,
+                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+                            uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+                            const qm_val_t *iqm_ptr);
+void vpx_highbd_quantize_dc_32x32(
+    const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr,
+    const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+    const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+    const qm_val_t *iqm_ptr);
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan,
+                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
+#endif
+#else
 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                      const int16_t *round_ptr, const int16_t quant_ptr,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -26,7 +67,13 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                            const int16_t *round_ptr, const int16_t quant_ptr,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t dequant_ptr, uint16_t *eob_ptr);
-
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block, const int16_t *zbin_ptr,
+                      const int16_t *round_ptr, const int16_t *quant_ptr,
+                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr, const int16_t *scan,
+                      const int16_t *iscan);
 #if CONFIG_VPX_HIGHBITDEPTH
 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                             int skip_block, const int16_t *round_ptr,
@@ -39,6 +86,14 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
                                   const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             int skip_block, const int16_t *zbin_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan);
+#endif
 #endif
 
 #ifdef __cplusplus
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index dcdefa45e8..10e33215d6 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -931,21 +931,35 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
 #
 # Quantization
 #
-if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
-  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
+if (vpx_config("CONFIG_AOM_QM") eq "yes") {
+  if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
+    add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
 
-  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
+    add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
 
-  if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_highbd_quantize_b sse2/;
+    if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
+      add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
 
-    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
-  }  # CONFIG_VPX_HIGHBITDEPTH
-}  # CONFIG_VP10_ENCODER
+      add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
+    }  # CONFIG_VPX_HIGHBITDEPTH
+  }  # CONFIG_VP10_ENCODER
+} else {
+  if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
+    add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
+
+    add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
+
+    if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
+      add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+      specialize qw/vpx_highbd_quantize_b sse2/;
+
+      add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+      specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+    }  # CONFIG_VPX_HIGHBITDEPTH
+  }  # CONFIG_VP10_ENCODER
+} # CONFIG_AOM_QM
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 #
-- 
GitLab