diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 22d827fe3e30c3b8f264e22e6bc4ddbbd4f9fe2a..dcdefa45e8fc2b37ac7f3808430fbdacff19014a 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -699,7 +699,7 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_iwht4x4_1_add/;
 
   add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_iwht4x4_16_add/;
+  specialize qw/vpx_iwht4x4_16_add/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
   specialize qw/vpx_highbd_idct4x4_1_add/;
@@ -762,7 +762,7 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
 
     add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1_add/;
-    
+
     add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
     specialize qw/vpx_highbd_idct4x4_16_add/;
 
@@ -785,10 +785,10 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_idct4x4_1_add sse2/;
 
     add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct8x8_64_add sse2/;
+    specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64_x86inc";
 
     add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct8x8_12_add sse2/;
+    specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64_x86inc";
 
     add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct8x8_1_add sse2/;
@@ -803,14 +803,15 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_idct16x16_1_add sse2/;
 
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_1024_add sse2/;
+    specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64_x86inc";
 
     add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_135_add sse2/;
+    specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64_x86inc";
+    # TODO: add a dedicated SSE2 idct32x32 for the 135-eob case; until then the SSE2 entry reuses the full 1024 version.
     $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
 
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_34_add sse2/;
+    specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64_x86inc";
 
     add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1_add sse2/;
diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
index 80a330b7b9bad33590a48526968a63d4c934d775..a835161ea02de3d2a9db9bcf5a72392402239091 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -220,7 +220,24 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
   mova    m12, [pw_11585x2]
 
   lea      r3, [2 * strideq]
-
+%if CONFIG_VPX_HIGHBITDEPTH
+  mova     m0, [inputq +   0]  ; high-bitdepth path: load 4 dword coefficients (presumably 32-bit tran_low_t -- confirm)
+  packssdw m0, [inputq +  16]  ; append the next 4 dwords; all 8 are narrowed to words with signed saturation
+  mova     m1, [inputq +  32]  ; each following register repeats the pair 32 bytes further on
+  packssdw m1, [inputq +  48]
+  mova     m2, [inputq +  64]
+  packssdw m2, [inputq +  80]
+  mova     m3, [inputq +  96]
+  packssdw m3, [inputq + 112]
+  mova     m4, [inputq + 128]
+  packssdw m4, [inputq + 144]
+  mova     m5, [inputq + 160]
+  packssdw m5, [inputq + 176]
+  mova     m6, [inputq + 192]
+  packssdw m6, [inputq + 208]
+  mova     m7, [inputq + 224]
+  packssdw m7, [inputq + 240]  ; m0-m7 now hold 8 words each, the same register layout the 16-bit path loads directly
+%else
   mova     m0, [inputq +   0]
   mova     m1, [inputq +  16]
   mova     m2, [inputq +  32]
@@ -229,7 +246,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
   mova     m5, [inputq +  80]
   mova     m6, [inputq +  96]
   mova     m7, [inputq + 112]
-
+%endif
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
   IDCT8_1D
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
@@ -254,10 +271,21 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
 
   lea        r3, [2 * strideq]
 
+%if CONFIG_VPX_HIGHBITDEPTH
+  mova       m0, [inputq +   0]  ; high-bitdepth path: load 4 dword coefficients (presumably 32-bit tran_low_t -- confirm)
+  packssdw   m0, [inputq +  16]  ; append the next 4 dwords; all 8 are narrowed to words with signed saturation
+  mova       m1, [inputq +  32]  ; same pair 32 bytes further on for each of the four registers this variant loads
+  packssdw   m1, [inputq +  48]
+  mova       m2, [inputq +  64]
+  packssdw   m2, [inputq +  80]
+  mova       m3, [inputq +  96]
+  packssdw   m3, [inputq + 112]  ; m0-m3 now hold 8 words each, matching what the 16-bit path loads directly
+%else
   mova       m0, [inputq +  0]
   mova       m1, [inputq + 16]
   mova       m2, [inputq + 32]
   mova       m3, [inputq + 48]
+%endif
 
   punpcklwd  m0, m1
   punpcklwd  m2, m3
@@ -765,6 +793,24 @@ idct32x32_34:
   lea             r4, [rsp + transposed_in]
 
 idct32x32_34_transpose:
+%if CONFIG_VPX_HIGHBITDEPTH
+  mova            m0, [r3 +       0]  ; high-bitdepth path: load 4 dword coefficients (presumably 32-bit tran_low_t -- confirm)
+  packssdw        m0, [r3 +      16]  ; append the next 4 dwords; all 8 are narrowed to words with signed saturation
+  mova            m1, [r3 + 32 *  4]  ; 128-byte step between registers, twice the 64 bytes (16 * 4) of the 16-bit path
+  packssdw        m1, [r3 + 32 *  4 + 16]
+  mova            m2, [r3 + 32 *  8]
+  packssdw        m2, [r3 + 32 *  8 + 16]
+  mova            m3, [r3 + 32 * 12]
+  packssdw        m3, [r3 + 32 * 12 + 16]
+  mova            m4, [r3 + 32 * 16]
+  packssdw        m4, [r3 + 32 * 16 + 16]
+  mova            m5, [r3 + 32 * 20]
+  packssdw        m5, [r3 + 32 * 20 + 16]
+  mova            m6, [r3 + 32 * 24]
+  packssdw        m6, [r3 + 32 * 24 + 16]
+  mova            m7, [r3 + 32 * 28]
+  packssdw        m7, [r3 + 32 * 28 + 16]  ; m0-m7 now hold 8 words each, ready for the same TRANSPOSE8X8 as the 16-bit path
+%else
   mova            m0, [r3 +       0]
   mova            m1, [r3 + 16 *  4]
   mova            m2, [r3 + 16 *  8]
@@ -773,6 +819,7 @@ idct32x32_34_transpose:
   mova            m5, [r3 + 16 * 20]
   mova            m6, [r3 + 16 * 24]
   mova            m7, [r3 + 16 * 28]
+%endif
 
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
 
@@ -1176,6 +1223,24 @@ idct32x32_135:
   mov             r7, 2
 
 idct32x32_135_transpose:
+%if CONFIG_VPX_HIGHBITDEPTH
+  mova            m0, [r3 +       0]  ; high-bitdepth path: load 4 dword coefficients (presumably 32-bit tran_low_t -- confirm)
+  packssdw        m0, [r3 +      16]  ; append the next 4 dwords; all 8 are narrowed to words with signed saturation
+  mova            m1, [r3 + 32 *  4]  ; 128-byte step between registers, twice the 64 bytes (16 * 4) of the 16-bit path
+  packssdw        m1, [r3 + 32 *  4 + 16]
+  mova            m2, [r3 + 32 *  8]
+  packssdw        m2, [r3 + 32 *  8 + 16]
+  mova            m3, [r3 + 32 * 12]
+  packssdw        m3, [r3 + 32 * 12 + 16]
+  mova            m4, [r3 + 32 * 16]
+  packssdw        m4, [r3 + 32 * 16 + 16]
+  mova            m5, [r3 + 32 * 20]
+  packssdw        m5, [r3 + 32 * 20 + 16]
+  mova            m6, [r3 + 32 * 24]
+  packssdw        m6, [r3 + 32 * 24 + 16]
+  mova            m7, [r3 + 32 * 28]
+  packssdw        m7, [r3 + 32 * 28 + 16]  ; m0-m7 now hold 8 words each, ready for the same TRANSPOSE8X8 as the 16-bit path
+%else
   mova            m0, [r3 +       0]
   mova            m1, [r3 + 16 *  4]
   mova            m2, [r3 + 16 *  8]
@@ -1184,7 +1249,7 @@ idct32x32_135_transpose:
   mova            m5, [r3 + 16 * 20]
   mova            m6, [r3 + 16 * 24]
   mova            m7, [r3 + 16 * 28]
-
+%endif
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
 
   mova [r4 +      0], m0
@@ -1196,14 +1261,22 @@ idct32x32_135_transpose:
   mova [r4 + 16 * 6], m6
   mova [r4 + 16 * 7], m7
 
+%if CONFIG_VPX_HIGHBITDEPTH
+  add             r3, 32  ; step the source by 32 bytes (8 dword coefficients) per pass; the 16-bit path steps 16
+%else
   add             r3, 16
+%endif
   add             r4, 16 * 8
   dec             r7
   jne idct32x32_135_transpose
 
   IDCT32X32_135 16*0, 16*32, 16*64, 16*96
   lea            stp, [stp + 16 * 8]
+%if CONFIG_VPX_HIGHBITDEPTH
+  lea         inputq, [inputq + 32 * 32]  ; 1024-byte advance, twice the 16-bit path's 512; presumably 8 rows of 32 dword coefficients -- confirm
+%else
   lea         inputq, [inputq + 16 * 32]
+%endif
   dec             r6
   jnz idct32x32_135
 
@@ -1614,6 +1687,24 @@ idct32x32_1024:
   mov             r7, 4
 
 idct32x32_1024_transpose:
+%if CONFIG_VPX_HIGHBITDEPTH
+  mova            m0, [r3 +       0]  ; high-bitdepth path: load 4 dword coefficients (presumably 32-bit tran_low_t -- confirm)
+  packssdw        m0, [r3 +      16]  ; append the next 4 dwords; all 8 are narrowed to words with signed saturation
+  mova            m1, [r3 + 32 *  4]  ; 128-byte step between registers, twice the 64 bytes (16 * 4) of the 16-bit path
+  packssdw        m1, [r3 + 32 *  4 + 16]
+  mova            m2, [r3 + 32 *  8]
+  packssdw        m2, [r3 + 32 *  8 + 16]
+  mova            m3, [r3 + 32 * 12]
+  packssdw        m3, [r3 + 32 * 12 + 16]
+  mova            m4, [r3 + 32 * 16]
+  packssdw        m4, [r3 + 32 * 16 + 16]
+  mova            m5, [r3 + 32 * 20]
+  packssdw        m5, [r3 + 32 * 20 + 16]
+  mova            m6, [r3 + 32 * 24]
+  packssdw        m6, [r3 + 32 * 24 + 16]
+  mova            m7, [r3 + 32 * 28]
+  packssdw        m7, [r3 + 32 * 28 + 16]  ; m0-m7 now hold 8 words each, ready for the same TRANSPOSE8X8 as the 16-bit path
+%else
   mova            m0, [r3 +       0]
   mova            m1, [r3 + 16 *  4]
   mova            m2, [r3 + 16 *  8]
@@ -1622,6 +1713,7 @@ idct32x32_1024_transpose:
   mova            m5, [r3 + 16 * 20]
   mova            m6, [r3 + 16 * 24]
   mova            m7, [r3 + 16 * 28]
+%endif
 
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
 
@@ -1633,8 +1725,11 @@ idct32x32_1024_transpose:
   mova [r4 + 16 * 5], m5
   mova [r4 + 16 * 6], m6
   mova [r4 + 16 * 7], m7
-
+%if CONFIG_VPX_HIGHBITDEPTH
+  add             r3, 32  ; step the source by 32 bytes (8 dword coefficients) per pass; the 16-bit path steps 16
+%else
   add             r3, 16
+%endif
   add             r4, 16 * 8
   dec             r7
   jne idct32x32_1024_transpose
@@ -1642,7 +1737,11 @@ idct32x32_1024_transpose:
   IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
 
   lea            stp, [stp + 16 * 8]
+%if CONFIG_VPX_HIGHBITDEPTH
+  lea         inputq, [inputq + 32 * 32]  ; 1024-byte advance, twice the 16-bit path's 512; presumably 8 rows of 32 dword coefficients -- confirm
+%else
   lea         inputq, [inputq + 16 * 32]
+%endif
   dec             r6
   jnz idct32x32_1024
 
diff --git a/vpx_dsp/x86/inv_wht_sse2.asm b/vpx_dsp/x86/inv_wht_sse2.asm
index df6f4692b48dcc07f43e46330641f792625e12db..eec504755016fb229a4cb97dd89f23d6f4b143bd 100644
--- a/vpx_dsp/x86/inv_wht_sse2.asm
+++ b/vpx_dsp/x86/inv_wht_sse2.asm
@@ -82,9 +82,15 @@ SECTION .text
 
 INIT_XMM sse2
 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+%if CONFIG_VPX_HIGHBITDEPTH
+  mova            m0,        [inputq +  0]  ; high-bitdepth path: load 4 dword coefficients (presumably 32-bit tran_low_t -- confirm)
+  packssdw        m0,        [inputq + 16]  ; append the next 4 dwords; all 8 are narrowed to words with signed saturation
+  mova            m1,        [inputq + 32]  ; second half of the 64-byte input, packed the same way
+  packssdw        m1,        [inputq + 48]  ; m0/m1 now hold 8 words each, matching what the 16-bit path loads directly
+%else
   mova            m0,        [inputq +  0]
   mova            m1,        [inputq + 16]
-
+%endif
   psraw           m0,        2
   psraw           m1,        2