diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index f0f6981517dae2b94fc8c081cef82f194e092e34..f9b7d26bb0a31a6ecaab7d5de11e8bef5d41022c 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -26,6 +26,9 @@ void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) {
   cfl->subsampling_y = cm->subsampling_y;
   cfl->are_parameters_computed = 0;
   cfl->store_y = 0;
+#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+  cfl_clear_sub8x8_val(cfl);
+#endif  // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
 }
 
 // Load from the CfL pixel buffer into output
@@ -311,10 +314,21 @@ void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, int row,
       assert(col == 0);
       col++;
     }
+#if CONFIG_DEBUG
+    for (int unit_r = 0; unit_r < tx_size_high_unit[tx_size]; unit_r++) {
+      assert(row + unit_r < 2);
+      int row_off = (row + unit_r) * 2;
+      for (int unit_c = 0; unit_c < tx_size_wide_unit[tx_size]; unit_c++) {
+        assert(col + unit_c < 2);
+        assert(cfl->sub8x8_val[row_off + col + unit_c] == 0);
+        cfl->sub8x8_val[row_off + col + unit_c] = 1;
+      }
+    }
+#endif  // CONFIG_DEBUG
   }
 #else
   (void)bsize;
-#endif
+#endif  // CONFIG_CHROMA_SUB8X8
 
   // Invalidate current parameters
   cfl->are_parameters_computed = 0;
@@ -357,6 +371,20 @@ void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
 #if CONFIG_CHROMA_SUB8X8
   const BLOCK_SIZE plane_bsize = AOMMAX(
       BLOCK_4X4, get_plane_block_size(mbmi->sb_type, &xd->plane[AOM_PLANE_U]));
+#if CONFIG_DEBUG
+  if (mbmi->sb_type < BLOCK_8X8) {
+    const int val_high =
+        block_size_high[BLOCK_8X8] / block_size_high[BLOCK_4X4];
+    const int val_wide =
+        block_size_wide[BLOCK_8X8] / block_size_wide[BLOCK_4X4];
+    for (int val_r = 0; val_r < val_high; val_r++) {
+      for (int val_c = 0; val_c < val_wide; val_c++) {
+        assert(cfl->sub8x8_val[(val_r * val_wide) + val_c] == 1);
+      }
+    }
+    cfl_clear_sub8x8_val(cfl);
+  }
+#endif  // CONFIG_DEBUG
 #else
   const BLOCK_SIZE plane_bsize =
       get_plane_block_size(mbmi->sb_type, &xd->plane[AOM_PLANE_U]);
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 1a1d2f68209ad0405dd41897c3d7211a47863134..d4ce9005ce810e545b0d2d1d5288adbf73f16209 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -13,6 +13,7 @@
 #define AV1_COMMON_CFL_H_
 
 #include <assert.h>
+#include <string.h>
 
 #include "av1/common/common.h"
 #include "av1/common/enums.h"
@@ -64,6 +65,16 @@ typedef struct {
 
   // Whether the reconstructed luma pixels need to be stored
   int store_y;
+
+#if CONFIG_CB4X4
+  int is_chroma_reference;
+#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+  // The prediction used for sub8x8 blocks originates from multiple luma
+  // blocks. This array is used to validate that cfl_store() is called only
+  // once for each luma block.
+  uint8_t sub8x8_val[4];
+#endif  // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+#endif  // CONFIG_CB4X4
 } CFL_CTX;
 
 static INLINE int get_scaled_luma_q0(int alpha_q3, int y_pix, int avg_q3) {
@@ -71,6 +82,12 @@ static INLINE int get_scaled_luma_q0(int alpha_q3, int y_pix, int avg_q3) {
   return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6);
 }
 
+#if CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+static INLINE void cfl_clear_sub8x8_val(CFL_CTX *cfl) {
+  memset(cfl->sub8x8_val, 0, sizeof(cfl->sub8x8_val));  // Reset store flags.
+}
+#endif  // CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+
 void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm);
 
 void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 0931a51f303086d9223a105e5e8ccd49f69fa450..e0fb51ba60bc3ddd1f0afd2f3611ef584aa86b0e 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -526,14 +526,14 @@ static void predict_and_reconstruct_intra_block(
         AOMMAX(BLOCK_4X4, get_plane_block_size(mbmi->sb_type, pd));
 #else
     const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
-#endif
+#endif  // CONFIG_CHROMA_SUB8X8
     uint8_t *dst =
         &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
     // TODO (ltrudeau) Store sub-8x8 inter blocks when bottom right block is
     // intra predicted.
     cfl_store(xd->cfl, dst, pd->dst.stride, row, col, tx_size, plane_bsize);
   }
-#endif
+#endif  // CONFIG_CFL
 }
 
 #if CONFIG_VAR_TX && !CONFIG_COEF_INTERLEAVE
@@ -1889,6 +1889,11 @@ static void decode_token_and_recon_block(AV1Decoder *const pbi,
         }
       }
     }
+#if CONFIG_CFL && CONFIG_CB4X4 && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+    if (xd->cfl->is_chroma_reference) {
+      cfl_clear_sub8x8_val(xd->cfl);  // Reset the cfl_store() tracking flags.
+    }
+#endif  // CONFIG_CFL && CONFIG_CB4X4 && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
   } else {
     int ref;
 
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 4ac0fda803f470e9e2ab4e701fcbfd1e7e9441f1..25a0ab9c380672107856330dc7b5dd435d02cfc4 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -1179,13 +1179,13 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm,
 #if CONFIG_CB4X4
   if (is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
                           xd->plane[1].subsampling_y)) {
+#if CONFIG_CFL
+    xd->cfl->is_chroma_reference = 1;
+#endif  // CONFIG_CFL
+#endif  // CONFIG_CB4X4
     mbmi->uv_mode = read_intra_mode_uv(ec_ctx, xd, r, mbmi->mode);
-#else
-  mbmi->uv_mode = read_intra_mode_uv(ec_ctx, xd, r, mbmi->mode);
-#endif
 
 #if CONFIG_CFL
-    // TODO(ltrudeau) support PALETTE
     if (mbmi->uv_mode == UV_CFL_PRED) {
       mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs);
       xd->cfl->store_y = 1;
@@ -1198,6 +1198,10 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm,
   } else {
     // Avoid decoding angle_info if there is is no chroma prediction
     mbmi->uv_mode = UV_DC_PRED;
+#if CONFIG_CFL
+    xd->cfl->is_chroma_reference = 0;
+    xd->cfl->store_y = 1;
+#endif  // CONFIG_CFL
   }
 #endif
 
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index e3d2574e00e58592553bedb85a0547c2a1688e01..74f91f6c176a1b751e86df2857c743ee79421da8 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -3609,6 +3609,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #else
     restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
 #endif
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+    if (!x->skip_chroma_rd) {
+      cfl_clear_sub8x8_val(xd->cfl);
+    }
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
   }
 
   // store estimated motion vector
@@ -3812,6 +3817,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
 #endif  // CONFIG_SUPERTX
     }
 
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+    if (!reached_last_index && sum_rdc.rdcost >= best_rdc.rdcost)
+      cfl_clear_sub8x8_val(xd->cfl);
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+
     if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -3997,6 +4007,11 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
     }
 #endif  // CONFIG_SUPERTX
 
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+    if (sum_rdc.rdcost >= best_rdc.rdcost) {
+      cfl_clear_sub8x8_val(xd->cfl);
+    }
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_HORZ];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -4177,6 +4192,12 @@ static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
     }
 #endif  // CONFIG_SUPERTX
 
+#if CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+    if (sum_rdc.rdcost >= best_rdc.rdcost) {
+      cfl_clear_sub8x8_val(xd->cfl);
+    }
+#endif  // CONFIG_CFL && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+
     if (sum_rdc.rdcost < best_rdc.rdcost) {
       sum_rdc.rate += partition_cost[PARTITION_VERT];
       sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -5993,6 +6014,13 @@ static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
     }
 #if CONFIG_CFL
     xd->cfl->store_y = 0;
+#if CONFIG_CB4X4 && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
+    if (is_chroma_reference(mi_row, mi_col, bsize, xd->cfl->subsampling_x,
+                            xd->cfl->subsampling_y) &&
+        !xd->cfl->are_parameters_computed) {
+      cfl_clear_sub8x8_val(xd->cfl);  // Reset the cfl_store() tracking flags.
+    }
+#endif  // CONFIG_CB4X4 && CONFIG_CHROMA_SUB8X8 && CONFIG_DEBUG
 #endif  // CONFIG_CFL
     if (!dry_run) {
       sum_intra_stats(td->counts, xd, mi, xd->above_mi, xd->left_mi,
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 67111316cc7ab53d4b160c1155168c8df1878c8f..3337574c75c5a2ab37c345ed88ea4b6fd6e63793 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -1100,7 +1100,7 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
     // intra predicted.
     cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size, plane_bsize);
   }
-#endif
+#endif  // CONFIG_CFL
 }
 
 void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,