diff --git a/test/reconintra_predictors_test.cc b/test/reconintra_predictors_test.cc
index 38720baa856b2c36007bdee571636e27ede5254a..5da9af57478622b96edb697fb7023d4ca72fa970 100644
--- a/test/reconintra_predictors_test.cc
+++ b/test/reconintra_predictors_test.cc
@@ -32,6 +32,20 @@ typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, int bs,
 typedef tuple<Predictor, Predictor, int> PredFuncMode;
 typedef tuple<PredFuncMode, int> PredParams;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*HbdPredictor)(uint16_t *dst, ptrdiff_t stride, int bs,
+                             const uint16_t *above, const uint16_t *left,
+                             int bd);
+
+// Note:
+//  Test parameter list:
+//  HbdPredFuncMode: reference predictor, optimized predictor, prediction mode
+//  HbdPredParams:   HbdPredFuncMode, block size, bit depth
+//
+typedef tuple<HbdPredictor, HbdPredictor, int> HbdPredFuncMode;
+typedef tuple<HbdPredFuncMode, int, int> HbdPredParams;
+#endif
+
 const int MaxBlkSize = 32;
 
 // By default, disable speed test
@@ -136,6 +150,105 @@ class VP10IntraPredOptimzTest : public ::testing::TestWithParam<PredParams> {
   uint8_t *predRef_;
 };
 
+#if CONFIG_VP9_HIGHBITDEPTH
+// Checks a high bit-depth filter-intra predictor against its C reference.
+// Parameters: (reference, optimized, mode), block size, bit depth.
+class VP10HbdIntraPredOptimzTest :
+      public ::testing::TestWithParam<HbdPredParams> {
+ public:
+  virtual ~VP10HbdIntraPredOptimzTest() {}
+  virtual void SetUp() {
+    HbdPredFuncMode funcMode = GET_PARAM(0);
+    predFuncRef_ = std::tr1::get<0>(funcMode);
+    predFunc_ = std::tr1::get<1>(funcMode);
+    mode_ = std::tr1::get<2>(funcMode);
+    blockSize_ = GET_PARAM(1);
+    bd_ = GET_PARAM(2);
+
+    // Allocated with new[] so that delete[] in TearDown() is the right match.
+    alloc_ = new uint16_t[3 * MaxBlkSize + 2];
+    predRef_ = new uint16_t[MaxBlkSize * MaxBlkSize];
+    pred_ = new uint16_t[MaxBlkSize * MaxBlkSize];
+  }
+
+  virtual void TearDown() {
+    delete[] alloc_;
+    delete[] predRef_;
+    delete[] pred_;
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunTest() const {
+    // One generator for the whole run, so each iteration gets new pixels.
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int stride = blockSize_;
+    uint16_t *left = alloc_;
+    uint16_t *above = alloc_ + MaxBlkSize + 1;
+    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
+      PrepareBuffer(&rnd);
+      predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
+      ASM_REGISTER_STATE_CHECK(
+          predFunc_(pred_, stride, blockSize_, &above[1], left, bd_));
+      DiffPred(tstIndex);
+    }
+  }
+
+  void RunSpeedTestC() const {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int stride = blockSize_;
+    uint16_t *left = alloc_;
+    uint16_t *above = alloc_ + MaxBlkSize + 1;
+    PrepareBuffer(&rnd);
+    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
+      predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
+    }
+  }
+
+  void RunSpeedTestSSE() const {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int stride = blockSize_;
+    uint16_t *left = alloc_;
+    uint16_t *above = alloc_ + MaxBlkSize + 1;
+    PrepareBuffer(&rnd);
+    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
+      predFunc_(predRef_, stride, blockSize_, &above[1], left, bd_);
+    }
+  }
+
+ private:
+  // Fills the whole reference buffer (left and above pixels) with random
+  // values masked to bd_ bits, drawn from the caller's generator.
+  void PrepareBuffer(ACMRandom *rnd) const {
+    const int mask = (1 << bd_) - 1;
+    for (int i = 0; i < 3 * MaxBlkSize + 2; ++i) {
+      alloc_[i] = rnd->Rand16() & mask;
+    }
+  }
+
+  // Compares pred_ with predRef_ element by element. Both were written with
+  // stride == blockSize_, so the block is the first blockSize_^2 entries.
+  void DiffPred(int testNum) const {
+    for (int i = 0; i < blockSize_ * blockSize_; ++i) {
+      EXPECT_EQ(predRef_[i], pred_[i])
+          << "Error at position: " << i << " "
+          << "Block size: " << blockSize_ << " "
+          << "Bit depth: " << bd_ << " "
+          << "Test number: " << testNum;
+    }
+  }
+
+  HbdPredictor predFunc_;
+  HbdPredictor predFuncRef_;
+  int mode_;
+  int blockSize_;
+  int bd_;
+  uint16_t *alloc_;
+  uint16_t *pred_;
+  uint16_t *predRef_;
+};
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 TEST_P(VP10IntraPredOptimzTest, BitExactCheck) {
   RunTest();
 }
@@ -150,6 +263,22 @@ TEST_P(VP10IntraPredOptimzTest, SpeedCheckSSE) {
 }
 #endif
 
+#if CONFIG_VP9_HIGHBITDEPTH
+// Bit-exactness: RunTest() compares the optimized predictor with the
+// reference one.
+TEST_P(VP10HbdIntraPredOptimzTest, BitExactCheck) { RunTest(); }
+
+#if PREDICTORS_SPEED_TEST
+// Speed run of the reference (C) predictor; only built when
+// PREDICTORS_SPEED_TEST is enabled.
+TEST_P(VP10HbdIntraPredOptimzTest, SpeedCheckC) { RunSpeedTestC(); }
+
+// Speed run of the optimized predictor; only built when
+// PREDICTORS_SPEED_TEST is enabled.
+TEST_P(VP10HbdIntraPredOptimzTest, SpeedCheckSSE) { RunSpeedTestSSE(); }
+#endif  // PREDICTORS_SPEED_TEST
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 using std::tr1::make_tuple;
 
 const PredFuncMode kPredFuncMdArray[] = {
@@ -183,4 +312,38 @@ INSTANTIATE_TEST_CASE_P(
          ::testing::ValuesIn(kPredFuncMdArray),
          ::testing::ValuesIn(kBlkSize)));
 
+#if CONFIG_VP9_HIGHBITDEPTH
+// (C reference, SSE4.1 implementation, prediction mode) for each filter mode.
+const HbdPredFuncMode kHbdPredFuncMdArray[] = {
+  make_tuple(vp10_highbd_dc_filter_predictor_c,
+             vp10_highbd_dc_filter_predictor_sse4_1, DC_PRED),
+  make_tuple(vp10_highbd_v_filter_predictor_c,
+             vp10_highbd_v_filter_predictor_sse4_1, V_PRED),
+  make_tuple(vp10_highbd_h_filter_predictor_c,
+             vp10_highbd_h_filter_predictor_sse4_1, H_PRED),
+  make_tuple(vp10_highbd_d45_filter_predictor_c,
+             vp10_highbd_d45_filter_predictor_sse4_1, D45_PRED),
+  make_tuple(vp10_highbd_d135_filter_predictor_c,
+             vp10_highbd_d135_filter_predictor_sse4_1, D135_PRED),
+  make_tuple(vp10_highbd_d117_filter_predictor_c,
+             vp10_highbd_d117_filter_predictor_sse4_1, D117_PRED),
+  make_tuple(vp10_highbd_d153_filter_predictor_c,
+             vp10_highbd_d153_filter_predictor_sse4_1, D153_PRED),
+  make_tuple(vp10_highbd_d207_filter_predictor_c,
+             vp10_highbd_d207_filter_predictor_sse4_1, D207_PRED),
+  make_tuple(vp10_highbd_d63_filter_predictor_c,
+             vp10_highbd_d63_filter_predictor_sse4_1, D63_PRED),
+  make_tuple(vp10_highbd_tm_filter_predictor_c,
+             vp10_highbd_tm_filter_predictor_sse4_1, TM_PRED),
+};
+
+const int kBd[] = {10, 12};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HbdIntraPredOptimzTest,
+    ::testing::Combine(::testing::ValuesIn(kHbdPredFuncMdArray),
+                       ::testing::ValuesIn(kBlkSize),
+                       ::testing::ValuesIn(kBd)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 }  // namespace
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index 19d0c3d48889b60df8ef74a0baa77d5f973fa26d..b5b0777e56fd251e286f790cc8c95dc1d0959561 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -1071,85 +1071,115 @@ static void highbd_filter_intra_predictors_4tap(uint16_t *dst, ptrdiff_t stride,
   }
 }
 
-static void highbd_dc_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_dc_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                        int bs, const uint16_t *above,
                                        const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED,
                                       bd);
 }
 
-static void highbd_v_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_v_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                       int bs, const uint16_t *above,
                                       const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED,
                                       bd);
 }
 
-static void highbd_h_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_h_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                       int bs, const uint16_t *above,
                                       const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED,
                                       bd);
 }
 
-static void highbd_d45_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d45_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                         int bs, const uint16_t *above,
                                         const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED,
                                       bd);
 }
 
-static void highbd_d135_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d135_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED,
                                       bd);
 }
 
-static void highbd_d117_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d117_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED,
                                       bd);
 }
 
-static void highbd_d153_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d153_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED,
                                       bd);
 }
 
-static void highbd_d207_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d207_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED,
                                       bd);
 }
 
-static void highbd_d63_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_d63_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                         int bs, const uint16_t *above,
                                         const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED,
                                       bd);
 }
 
-static void highbd_tm_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+void vp10_highbd_tm_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                        int bs, const uint16_t *above,
                                        const uint16_t *left, int bd) {
   highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED,
                                       bd);
 }
 
-static void (*highbd_filter_intra_predictors[EXT_INTRA_MODES])(uint16_t *dst,
-    ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left,
-    int bd) = {
-        highbd_dc_filter_predictor, highbd_v_filter_predictor,
-        highbd_h_filter_predictor, highbd_d45_filter_predictor,
-        highbd_d135_filter_predictor, highbd_d117_filter_predictor,
-        highbd_d153_filter_predictor, highbd_d207_filter_predictor,
-        highbd_d63_filter_predictor, highbd_tm_filter_predictor,
-};
+// Calls the high bit-depth filter-intra predictor that matches |mode|.
+static void highbd_filter_intra_predictors(
+    int mode, uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) {
+  switch (mode) {
+    case DC_PRED:
+      vp10_highbd_dc_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case V_PRED:
+      vp10_highbd_v_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case H_PRED:
+      vp10_highbd_h_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D45_PRED:
+      vp10_highbd_d45_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D135_PRED:
+      vp10_highbd_d135_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D117_PRED:
+      vp10_highbd_d117_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D153_PRED:
+      vp10_highbd_d153_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D207_PRED:
+      vp10_highbd_d207_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case D63_PRED:
+      vp10_highbd_d63_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    case TM_PRED:
+      vp10_highbd_tm_filter_predictor(dst, stride, bs, above, left, bd);
+      break;
+    default:
+      assert(0);  // no predictor is listed for this mode
+  }
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_EXT_INTRA
 
@@ -1303,7 +1333,7 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
 
 #if CONFIG_EXT_INTRA
   if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
-    highbd_filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
+    highbd_filter_intra_predictors(ext_intra_mode, dst, dst_stride, bs,
         const_above_row, left_col, xd->bd);
     return;
   }
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 6dbcc65c9889e9919ae456bd2b197887040d427c..b82b2634996ea0745e56e9defa519368743b7aab 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -320,6 +320,29 @@ if (vpx_config("CONFIG_EXT_INTRA") eq "yes") {
   specialize qw/vp10_d63_filter_predictor sse4_1/;
   add_proto qw/void vp10_tm_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
   specialize qw/vp10_tm_filter_predictor sse4_1/;
+  # High bitdepth filter intra predictors
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vp10_highbd_dc_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_dc_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_v_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_v_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_h_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_h_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d45_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d45_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d135_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d135_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d117_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d117_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d153_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d153_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d207_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d207_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_d63_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_d63_filter_predictor sse4_1/;
+    add_proto qw/void vp10_highbd_tm_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+    specialize qw/vp10_highbd_tm_filter_predictor sse4_1/;
+  }
 }
 
 # High bitdepth functions
diff --git a/vp10/common/x86/reconintra_sse4.c b/vp10/common/x86/reconintra_sse4.c
index 851d850e732c0a6bbd6a1878435e8f6b5d7ce80a..7399de2b0f95dfa8270364050d03948b1f5c202f 100644
--- a/vp10/common/x86/reconintra_sse4.c
+++ b/vp10/common/x86/reconintra_sse4.c
@@ -591,3 +591,323 @@ void vp10_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
   GetIntraFilterParams(bs, TM_PRED, &prm[0]);
   FilterPrediction(above, left, bs, prm, dst, stride);
 }
+
+// ============== High Bit Depth ==============
+#if CONFIG_VP9_HIGHBITDEPTH
+// Rounded mean of the 4 above and 4 left reference pixels: (sum + 4) >> 3.
+// The mean is also broadcast to the four 32-bit lanes of *params.
+static INLINE int HighbdGetMeanValue4x4(const uint16_t *above,
+                                        const uint16_t *left, const int bd,
+                                        __m128i *params) {
+  // Load only the four pixels that are used (64 bits); the upper lanes are
+  // zero, so nothing past above[3] / left[3] is read.
+  const __m128i a = _mm_loadl_epi64((const __m128i *)above);
+  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum_vector = _mm_add_epi16(a, l);
+  uint16_t sum_value;
+  (void)bd;  // unused: the sum is kept in 16-bit lanes for every bit depth
+
+  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // 2 partial sums left
+  sum_vector = _mm_add_epi16(sum_vector, _mm_srli_si128(sum_vector, 2));
+  sum_value = _mm_extract_epi16(sum_vector, 0);
+  sum_value += 4;
+  sum_value >>= 3;
+  *params = _mm_set1_epi32(sum_value);
+  return sum_value;
+}
+
+// Rounded mean of the 8 above and 8 left reference pixels: (sum + 8) >> 4.
+// The mean is also broadcast to the four 32-bit lanes of *params.
+static INLINE int HighbdGetMeanValue8x8(const uint16_t *above,
+                                        const uint16_t *left, const int bd,
+                                        __m128i *params) {
+  const __m128i above_px = _mm_loadu_si128((const __m128i *)above);
+  const __m128i left_px = _mm_loadu_si128((const __m128i *)left);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i acc = _mm_add_epi16(above_px, left_px);  // 8 pairwise sums
+  uint16_t mean;
+  (void)bd;  // unused: the sum is kept in 16-bit lanes for every bit depth
+
+  // Fold the eight 16-bit lanes down to a single total in lane 0.
+  acc = _mm_hadd_epi16(acc, zero);  // 4 partial sums
+  acc = _mm_hadd_epi16(acc, zero);  // 2 partial sums
+  acc = _mm_add_epi16(acc, _mm_srli_si128(acc, 2));
+
+  // Round to nearest and divide by the 16 contributing pixels.
+  mean = _mm_extract_epi16(acc, 0);
+  mean += 8;
+  mean >>= 4;
+  *params = _mm_set1_epi32(mean);
+  return mean;
+}
+
+// Note:
+//  Sums 16 pixels above and 16 pixels left into eight 16-bit lanes.
+//  The result overwrites sum[0]; lane i holds above[i] + above[i + 8] +
+//  left[i] + left[i + 8].
+static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left,
+                                  __m128i *sum) {
+  const __m128i a_lo = _mm_loadu_si128((const __m128i *)above);
+  const __m128i l_lo = _mm_loadu_si128((const __m128i *)left);
+  const __m128i a_hi = _mm_loadu_si128((const __m128i *)(above + 8));
+  const __m128i l_hi = _mm_loadu_si128((const __m128i *)(left + 8));
+  // Wrapping 16-bit adds, accumulated as ((a_lo + l_lo) + a_hi) + l_hi.
+  sum[0] = _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a_lo, l_lo), a_hi), l_hi);
+}
+
+// Note:
+//  Sums 16 pixels above and 16 pixels left into four 32-bit lanes.
+//  Each pixel is zero-extended to 32 bits first; the result overwrites
+//  sum[0].
+//  Lane i accumulates pixels i, i + 4, i + 8 and i + 12 of both rows.
+static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left,
+                                  __m128i *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_lo = _mm_loadu_si128((const __m128i *)above);
+  const __m128i l_lo = _mm_loadu_si128((const __m128i *)left);
+  const __m128i a_hi = _mm_loadu_si128((const __m128i *)(above + 8));
+  const __m128i l_hi = _mm_loadu_si128((const __m128i *)(left + 8));
+  __m128i acc;
+
+  // Pixels 0..3
+  acc = _mm_add_epi32(_mm_unpacklo_epi16(a_lo, zero),
+                      _mm_unpacklo_epi16(l_lo, zero));
+
+  // Pixels 4..7
+  acc = _mm_add_epi32(acc, _mm_unpackhi_epi16(a_lo, zero));
+  acc = _mm_add_epi32(acc, _mm_unpackhi_epi16(l_lo, zero));
+
+  // Pixels 8..11
+  acc = _mm_add_epi32(acc, _mm_unpacklo_epi16(a_hi, zero));
+  acc = _mm_add_epi32(acc, _mm_unpacklo_epi16(l_hi, zero));
+
+  // Pixels 12..15
+  acc = _mm_add_epi32(acc, _mm_unpackhi_epi16(a_hi, zero));
+  acc = _mm_add_epi32(acc, _mm_unpackhi_epi16(l_hi, zero));
+
+  sum[0] = acc;
+}
+
+// Rounded mean of the 16 above and 16 left reference pixels: (sum + 16) >> 5.
+// The mean is also broadcast to the four 32-bit lanes of *params.
+static INLINE int HighbdGetMeanValue16x16(const uint16_t *above,
+                                          const uint16_t *left, const int bd,
+                                          __m128i *params) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum;
+  uint32_t sum_value;
+
+  // bd <= 10 (including 8) sums in 16 bits; anything wider sums in 32 bits.
+  if (bd <= 10) {
+    // 32 pixels below 2^10 sum to less than 2^15: 16-bit lanes suffice.
+    AddPixels10bit(above, left, &sum);
+    sum = _mm_hadd_epi16(sum, zero);  // 4 partial sums
+    sum = _mm_hadd_epi16(sum, zero);  // 2 partial sums
+    sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 2));
+    sum_value = _mm_extract_epi16(sum, 0);
+  } else {
+    // Wider pixels are accumulated in 32-bit lanes to avoid overflow.
+    AddPixels12bit(above, left, &sum);
+    sum = _mm_hadd_epi32(sum, zero);  // 2 partial sums
+    sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+    sum_value = _mm_extract_epi32(sum, 0);
+  }
+
+  sum_value = (sum_value + 16) >> 5;
+  *params = _mm_set1_epi32(sum_value);
+  return sum_value;
+}
+
+// Rounded mean of the 32 above and 32 left reference pixels: (sum + 32) >> 6.
+// The mean is also broadcast to the four 32-bit lanes of *params.
+static INLINE int HighbdGetMeanValue32x32(const uint16_t *above,
+                                          const uint16_t *left, const int bd,
+                                          __m128i *params) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum, sum_hi;
+  uint32_t sum_value;
+
+  // bd <= 10 (including 8) sums in 16 bits; anything wider sums in 32 bits.
+  if (bd <= 10) {
+    // 64 pixels below 2^10 sum to less than 2^16: 16-bit lanes suffice.
+    AddPixels10bit(above, left, &sum);
+    AddPixels10bit(above + 16, left + 16, &sum_hi);
+    sum = _mm_add_epi16(sum, sum_hi);
+    sum = _mm_hadd_epi16(sum, zero);  // 4 partial sums
+    sum = _mm_hadd_epi16(sum, zero);  // 2 partial sums
+    sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 2));
+    sum_value = _mm_extract_epi16(sum, 0);
+  } else {
+    // Wider pixels are accumulated in 32-bit lanes to avoid overflow.
+    AddPixels12bit(above, left, &sum);
+    AddPixels12bit(above + 16, left + 16, &sum_hi);
+    sum = _mm_add_epi32(sum, sum_hi);
+    sum = _mm_hadd_epi32(sum, zero);  // 2 partial sums
+    sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+    sum_value = _mm_extract_epi32(sum, 0);
+  }
+
+  // Round to nearest and divide by the 64 contributing pixels.
+  sum_value = (sum_value + 32) >> 6;
+  *params = _mm_set1_epi32(sum_value);
+  return sum_value;
+}
+
+// Note:
+//  Computes the rounded mean of the reference pixels for block size |bs|
+//  by calling the helper for that size (4, 8, 16 or 32).
+//  The helper also stores the mean in *params as 4 repeated int32_t values.
+//  Any other block size asserts; the function then returns 0 and leaves
+//  *params untouched.
+static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above,
+                                               const uint16_t *left, int bs,
+                                               const int bd, __m128i *params) {
+  int mean = 0;
+
+  if (4 == bs) {
+    mean = HighbdGetMeanValue4x4(above, left, bd, params);
+  } else if (8 == bs) {
+    mean = HighbdGetMeanValue8x8(above, left, bd, params);
+  } else if (16 == bs) {
+    mean = HighbdGetMeanValue16x16(above, left, bd, params);
+  } else if (32 == bs) {
+    mean = HighbdGetMeanValue32x32(above, left, bd, params);
+  } else {
+    assert(0);
+  }
+
+  return mean;
+}
+
+// Note:
+//  Row 0 of the local array holds above[-1 .. 2 * bs - 1] and column 0 holds
+//  left[0 .. bs - 1], both with meanValue subtracted. ProducePixels() is run
+//  over rows 0 .. bs - 1, then rows/columns 1 .. bs are written to dst with
+//  meanValue added back.
+//  At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
+//  the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
+static void HighbdGeneratePrediction(const uint16_t *above,
+                                     const uint16_t *left,
+                                     const int bs, const int bd,
+                                     const __m128i *prm, int meanValue,
+                                     uint16_t *dst,
+                                     ptrdiff_t stride) {
+  int pred[33][65];
+  int row, col;
+
+  for (row = 0; row < bs; ++row) {
+    pred[row + 1][0] = (int)left[row] - meanValue;
+  }
+
+  // Row 0 starts at the top-left pixel above[-1].
+  for (col = 0; col < 2 * bs + 1; ++col) {
+    pred[0][col] = (int)above[col - 1] - meanValue;
+  }
+
+  for (row = 0; row < bs; ++row) {
+    const int colBound = (bs << 1) - row;
+    for (col = 0; col < colBound; col += 4) {
+      // Pixels still to be handled in this row from column |col| on.
+      const int remainings = colBound - col + 1;
+      ProducePixels(&pred[row][col], prm, remainings);
+    }
+  }
+
+  // Add the mean back and clip through clip_pixel_highbd() for |bd|.
+  for (row = 0; row < bs; ++row) {
+    for (col = 0; col < bs; ++col) {
+      dst[col] = clip_pixel_highbd(pred[row + 1][col + 1] + meanValue, bd);
+    }
+    dst += stride;
+  }
+}
+
+// Stores the reference-pixel mean in prm[4], then generates the prediction.
+static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left,
+                                   int bs, const int bd, __m128i *prm,
+                                   uint16_t *dst, ptrdiff_t stride) {
+  const int mean = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, &prm[4]);
+  HighbdGeneratePrediction(above, left, bs, bd, prm, mean, dst, stride);
+}
+
+void vp10_highbd_dc_filter_predictor_sse4_1(
+    uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) {
+  __m128i prm[5];  // prm[4] receives the mean in HighbdFilterPrediction()
+  GetIntraFilterParams(bs, DC_PRED, prm);  // parameters for DC_PRED
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_v_filter_predictor_sse4_1(
+    uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) {
+  __m128i prm[5];  // prm[4] receives the mean in HighbdFilterPrediction()
+  GetIntraFilterParams(bs, V_PRED, prm);  // parameters for V_PRED
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_h_filter_predictor_sse4_1(
+    uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) {
+  __m128i prm[5];  // prm[4] receives the mean in HighbdFilterPrediction()
+  GetIntraFilterParams(bs, H_PRED, prm);  // parameters for H_PRED
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d45_filter_predictor_sse4_1(
+    uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) {
+  __m128i prm[5];  // prm[4] receives the mean in HighbdFilterPrediction()
+  GetIntraFilterParams(bs, D45_PRED, prm);  // parameters for D45_PRED
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d135_filter_predictor_sse4_1(
+    uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) {
+  __m128i prm[5];  // prm[4] receives the mean in HighbdFilterPrediction()
+  GetIntraFilterParams(bs, D135_PRED, prm);  // parameters for D135_PRED
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d117_filter_predictor_sse4_1(
+    uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) {
+  __m128i prm[5];  // prm[4] receives the mean in HighbdFilterPrediction()
+  GetIntraFilterParams(bs, D117_PRED, prm);  // parameters for D117_PRED
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d153_filter_predictor_sse4_1(
+    uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) {
+  __m128i prm[5];  // prm[4] receives the mean in HighbdFilterPrediction()
+  GetIntraFilterParams(bs, D153_PRED, prm);  // parameters for D153_PRED
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d207_filter_predictor_sse4_1(
+    uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) {
+  __m128i prm[5];  // prm[4] receives the mean in HighbdFilterPrediction()
+  GetIntraFilterParams(bs, D207_PRED, prm);  // parameters for D207_PRED
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_d63_filter_predictor_sse4_1(
+    uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) {
+  __m128i prm[5];  // prm[4] receives the mean in HighbdFilterPrediction()
+  GetIntraFilterParams(bs, D63_PRED, prm);  // parameters for D63_PRED
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void vp10_highbd_tm_filter_predictor_sse4_1(
+    uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above,
+    const uint16_t *left, int bd) {
+  __m128i prm[5];  // prm[4] receives the mean in HighbdFilterPrediction()
+  GetIntraFilterParams(bs, TM_PRED, prm);  // parameters for TM_PRED
+  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH