diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 22d827fe3e30c3b8f264e22e6bc4ddbbd4f9fe2a..dcdefa45e8fc2b37ac7f3808430fbdacff19014a 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -699,7 +699,7 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_iwht4x4_1_add/; add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_iwht4x4_16_add/; + specialize qw/vpx_iwht4x4_16_add/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; specialize qw/vpx_highbd_idct4x4_1_add/; @@ -762,7 +762,7 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_1_add/; - + add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; specialize qw/vpx_highbd_idct4x4_16_add/; @@ -785,10 +785,10 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct4x4_1_add sse2/; add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct8x8_64_add sse2/; + specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64_x86inc"; add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct8x8_12_add sse2/; + specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64_x86inc"; add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct8x8_1_add sse2/; @@ -803,14 +803,15 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct16x16_1_add sse2/; add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_1024_add sse2/; + specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64_x86inc"; add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_135_add sse2/; + specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64_x86inc"; + # Need to add 135 eob idct32x32 implementations. $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_34_add sse2/; + specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64_x86inc"; add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_1_add sse2/; diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm index 80a330b7b9bad33590a48526968a63d4c934d775..a835161ea02de3d2a9db9bcf5a72392402239091 100644 --- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm @@ -220,7 +220,24 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride mova m12, [pw_11585x2] lea r3, [2 * strideq] - +%if CONFIG_VPX_HIGHBITDEPTH + mova m0, [inputq + 0] + packssdw m0, [inputq + 16] + mova m1, [inputq + 32] + packssdw m1, [inputq + 48] + mova m2, [inputq + 64] + packssdw m2, [inputq + 80] + mova m3, [inputq + 96] + packssdw m3, [inputq + 112] + mova m4, [inputq + 128] + packssdw m4, [inputq + 144] + mova m5, [inputq + 160] + packssdw m5, [inputq + 176] + mova m6, [inputq + 192] + packssdw m6, [inputq + 208] + mova m7, [inputq + 224] + packssdw m7, [inputq + 240] +%else mova m0, [inputq + 0] mova m1, [inputq + 16] mova m2, [inputq + 32] @@ -229,7 +246,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride mova m5, [inputq + 80] mova m6, [inputq + 96] mova m7, [inputq + 112] - +%endif TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 IDCT8_1D TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 @@ -254,10 +271,21 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride lea r3, [2 * strideq] +%if CONFIG_VPX_HIGHBITDEPTH + mova m0, [inputq + 0] + packssdw m0, [inputq + 16] + mova m1, [inputq + 32] + packssdw m1, [inputq + 48] + mova m2, [inputq + 64] + packssdw m2, [inputq + 80] + mova m3, [inputq + 96] + packssdw m3, [inputq + 112] +%else mova m0, [inputq + 0] mova m1, [inputq + 16] mova m2, [inputq + 32] mova m3, [inputq + 48] +%endif punpcklwd m0, m1 punpcklwd m2, m3 @@ -765,6 +793,24 @@ idct32x32_34: lea r4, [rsp + transposed_in] idct32x32_34_transpose: +%if CONFIG_VPX_HIGHBITDEPTH + mova m0, [r3 + 0] + packssdw m0, [r3 + 16] + mova m1, [r3 + 32 * 4] + packssdw m1, [r3 + 32 * 4 + 16] + mova m2, [r3 + 32 * 8] + packssdw m2, [r3 + 32 * 8 + 16] + mova m3, [r3 + 32 * 12] + packssdw m3, [r3 + 32 * 12 + 16] + mova m4, [r3 + 32 * 16] + packssdw m4, [r3 + 32 * 16 + 16] + mova m5, [r3 + 32 * 20] + packssdw m5, [r3 + 32 * 20 + 16] + mova m6, [r3 + 32 * 24] + packssdw m6, [r3 + 32 * 24 + 16] + mova m7, [r3 + 32 * 28] + packssdw m7, [r3 + 32 * 28 + 16] +%else mova m0, [r3 + 0] mova m1, [r3 + 16 * 4] mova m2, [r3 + 16 * 8] @@ -773,6 +819,7 @@ idct32x32_34_transpose: mova m5, [r3 + 16 * 20] mova m6, [r3 + 16 * 24] mova m7, [r3 + 16 * 28] +%endif TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 @@ -1176,6 +1223,24 @@ idct32x32_135: mov r7, 2 idct32x32_135_transpose: +%if CONFIG_VPX_HIGHBITDEPTH + mova m0, [r3 + 0] + packssdw m0, [r3 + 16] + mova m1, [r3 + 32 * 4] + packssdw m1, [r3 + 32 * 4 + 16] + mova m2, [r3 + 32 * 8] + packssdw m2, [r3 + 32 * 8 + 16] + mova m3, [r3 + 32 * 12] + packssdw m3, [r3 + 32 * 12 + 16] + mova m4, [r3 + 32 * 16] + packssdw m4, [r3 + 32 * 16 + 16] + mova m5, [r3 + 32 * 20] + packssdw m5, [r3 + 32 * 20 + 16] + mova m6, [r3 + 32 * 24] + packssdw m6, [r3 + 32 * 24 + 16] + mova m7, [r3 + 32 * 28] + packssdw m7, [r3 + 32 * 28 + 16] +%else mova m0, [r3 + 0] mova m1, [r3 + 16 * 4] mova m2, [r3 + 16 * 8] @@ -1184,7 +1249,7 @@ idct32x32_135_transpose: mova m5, [r3 + 16 * 20] mova m6, [r3 + 16 * 24] mova m7, [r3 + 16 * 28] - +%endif TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 mova [r4 + 0], m0 @@ -1196,14 +1261,22 @@ idct32x32_135_transpose: mova [r4 + 16 * 6], m6 mova [r4 + 16 * 7], m7 +%if CONFIG_VPX_HIGHBITDEPTH + add r3, 32 +%else add r3, 16 +%endif add r4, 16 * 8 dec r7 jne idct32x32_135_transpose IDCT32X32_135 16*0, 16*32, 16*64, 16*96 lea stp, [stp + 16 * 8] +%if CONFIG_VPX_HIGHBITDEPTH + lea inputq, [inputq + 32 * 32] +%else lea inputq, [inputq + 16 * 32] +%endif dec r6 jnz idct32x32_135 @@ -1614,6 +1687,24 @@ idct32x32_1024: mov r7, 4 idct32x32_1024_transpose: +%if CONFIG_VPX_HIGHBITDEPTH + mova m0, [r3 + 0] + packssdw m0, [r3 + 16] + mova m1, [r3 + 32 * 4] + packssdw m1, [r3 + 32 * 4 + 16] + mova m2, [r3 + 32 * 8] + packssdw m2, [r3 + 32 * 8 + 16] + mova m3, [r3 + 32 * 12] + packssdw m3, [r3 + 32 * 12 + 16] + mova m4, [r3 + 32 * 16] + packssdw m4, [r3 + 32 * 16 + 16] + mova m5, [r3 + 32 * 20] + packssdw m5, [r3 + 32 * 20 + 16] + mova m6, [r3 + 32 * 24] + packssdw m6, [r3 + 32 * 24 + 16] + mova m7, [r3 + 32 * 28] + packssdw m7, [r3 + 32 * 28 + 16] +%else mova m0, [r3 + 0] mova m1, [r3 + 16 * 4] mova m2, [r3 + 16 * 8] @@ -1622,6 +1713,7 @@ idct32x32_1024_transpose: mova m5, [r3 + 16 * 20] mova m6, [r3 + 16 * 24] mova m7, [r3 + 16 * 28] +%endif TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 @@ -1633,8 +1725,11 @@ idct32x32_1024_transpose: mova [r4 + 16 * 5], m5 mova [r4 + 16 * 6], m6 mova [r4 + 16 * 7], m7 - +%if CONFIG_VPX_HIGHBITDEPTH + add r3, 32 +%else add r3, 16 +%endif add r4, 16 * 8 dec r7 jne idct32x32_1024_transpose @@ -1642,7 +1737,11 @@ idct32x32_1024_transpose: IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 lea stp, [stp + 16 * 8] +%if CONFIG_VPX_HIGHBITDEPTH + lea inputq, [inputq + 32 * 32] +%else lea inputq, [inputq + 16 * 32] +%endif dec r6 jnz idct32x32_1024 diff --git a/vpx_dsp/x86/inv_wht_sse2.asm b/vpx_dsp/x86/inv_wht_sse2.asm index df6f4692b48dcc07f43e46330641f792625e12db..eec504755016fb229a4cb97dd89f23d6f4b143bd 100644 --- a/vpx_dsp/x86/inv_wht_sse2.asm +++ b/vpx_dsp/x86/inv_wht_sse2.asm @@ -82,9 +82,15 @@ SECTION .text INIT_XMM sse2 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride +%if CONFIG_VPX_HIGHBITDEPTH + mova m0, [inputq + 0] + packssdw m0, [inputq + 16] + mova m1, [inputq + 32] + packssdw m1, [inputq + 48] +%else mova m0, [inputq + 0] mova m1, [inputq + 16] - +%endif psraw m0, 2 psraw m1, 2