From db2247236409e8b5dbb231c9c77651042f05829b Mon Sep 17 00:00:00 2001
From: Henrik Gramner <gramner@twoorioles.com>
Date: Mon, 8 Oct 2018 13:03:29 +0200
Subject: [PATCH] x86: Add dc/h/v intra prediction AVX2 asm

---
 src/x86/ipred.asm | 399 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 399 insertions(+)
 create mode 100644 src/x86/ipred.asm

diff --git a/src/x86/ipred.asm b/src/x86/ipred.asm
new file mode 100644
index 00000000..dec92560
--- /dev/null
+++ b/src/x86/ipred.asm
@@ -0,0 +1,399 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+pb_128: times 4 db 128
+
+%macro JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - 2*4)
+    %xdefine %%base mangle(private_prefix %+ _%1_%2)
+    %%table:
+    %rep %0 - 2
+        dd %%base %+ .%3 - (%%table - 2*4)
+        %rotate 1
+    %endrep
+%endmacro
+
+%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
+
+JMP_TABLE ipred_dc,      avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+                               s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
+JMP_TABLE ipred_h,       avx2, w4, w8, w16, w32, w64
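+
+; Each entry stores a label offset relative to the table symbol, which itself
+; points 2*4 bytes before the data so that the tables can be indexed directly
+; by tzcnt(w) or tzcnt(h) (always >= 2). The ipred_dc table holds 5 .h*, 5 .w*
+; and 5 .s* (splat/store) targets; ipred_dc_splat_avx2_table aliases the .s*
+; part by pointing 10*4 bytes further in, and the s*-10*4 terms above
+; compensate for that shifted base.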
+
+SECTION .text
+
+INIT_YMM avx2
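+; dc_top: sum the w top pixels at tl+1 (pcmpeqd gives -1 bytes, so pmaddubsw
+; produces negated pair sums), then reuse dc_left's .h* reduction chain,
+; indexed by tzcnt(w) instead of tzcnt(h). The store loop is reached through
+; the shared splat table.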
+cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
+    lea                  r5, [ipred_dc_left_avx2_table]
+    tzcnt                wd, wm
+    inc                 tlq
+    movu                 m0, [tlq]
+    movifnidn            hd, hm
+    mov                 r6d, 0x8000
+    shrx                r6d, r6d, wd
+    movd                xm3, r6d ; xm3 = 0x8000 >> log2(w)
+    movsxd               r6, [r5+wq*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    jmp                  r6
+
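+; dc_left: the left edge is the h bytes directly below tl. The .h* chain folds
+; the negated pair sums for the given height down to the low dword; the final
+; pmaddwd by -1 both finishes the addition and makes the total positive, and
+; pmulhrsw by xm3 = 0x8000 >> log2(n) (n = number of pixels summed) then gives
+; the rounded average:
+;   (sum * (0x8000 >> k) * 2 + 0x8000) >> 16 = (sum + (1 << (k - 1))) >> k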
+cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
+    mov                  hd, hm ; zero upper half
+    tzcnt               r6d, hd
+    sub                 tlq, hq
+    tzcnt                wd, wm
+    movu                 m0, [tlq]
+    mov                 r5d, 0x8000
+    shrx                r5d, r5d, r6d
+    movd                xm3, r5d ; xm3 = 0x8000 >> log2(h)
+    lea                  r5, [ipred_dc_left_avx2_table]
+    movsxd               r6, [r5+r6*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    jmp                  r6
+.h64:
+    movu                 m1, [tlq+32] ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+.h32:
+    vextracti128        xm1, m0, 1
+    paddw               xm0, xm1
+.h16:
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+.h8:
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+.h4:
+    pmaddwd             xm0, xm2
+    pmulhrsw            xm0, xm3
+    lea            stride3q, [strideq*3]
+    vpbroadcastb         m0, xm0
+    mova                 m1, m0 ; .s64 stores m0 and m1
+    jmp                  wq
+
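+; dc: dc = (sum(top) + sum(left) + ((w + h) >> 1)) / (w + h). When w == h the
+; division is just a shift; for rectangular blocks w + h is 3 or 5 times a
+; power of two, so the sum is shifted right first and the remaining /3 or /5 is
+; done with a 0.16 fixed-point reciprocal through pmulhuw (0x5556 ~= 1/3,
+; 0x3334 ~= 1/5). The per-width tails pick the reciprocal either with cmov or
+; by shrx-shifting a packed pair of constants, exploiting that shift counts are
+; taken mod 32.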
+cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
+    movifnidn            hd, hm
+    movifnidn            wd, wm
+    tzcnt               r6d, hd
+    lea                 r5d, [wq+hq]
+    movd                xm4, r5d ; xm4 = w + h
+    tzcnt               r5d, r5d
+    movd                xm5, r5d ; xm5 = tzcnt(w + h) (shift for the power-of-two part)
+    lea                  r5, [ipred_dc_avx2_table]
+    tzcnt                wd, wd
+    movsxd               r6, [r5+r6*4]
+    movsxd               wq, [r5+wq*4+5*4] ; skip the five .h* entries
+    pcmpeqd              m3, m3
+    psrlw               xm4, 1 ; xm4 = (w + h) >> 1 (rounding bias)
+    add                  r6, r5
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  r6
+.h4:
+    movd                xm0, [tlq-4]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w4:
+    movd                xm1, [tlq+1]
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4 ; add the rounding bias (the sums are negated)
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    cmp                  hd, 4
+    jg .w4_mul
+    psrlw               xm0, 3 ; / 8 (w = h = 4)
+    jmp .w4_end
+.w4_mul:
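+    ; w+h is 12 (h=8) or 20 (h=16): shift the sum right by 2, then multiply by
+    ; 0x5556 (~1/3) or 0x3334 (~1/5). shrx by 2*h picks the high word for h=8
+    ; and, since the count wraps mod 32, the low word for h=16.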
+    punpckhqdq          xm1, xm0, xm0
+    lea                 r2d, [hq*2]
+    mov                 r6d, 0x55563334
+    paddw               xm0, xm1
+    shrx                r6d, r6d, r2d
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    movd                xm1, r6d
+    psrlw               xm0, 2
+    pmulhuw             xm0, xm1
+.w4_end:
+    vpbroadcastb        xm0, xm0
+.s4:
+    movd   [dstq+strideq*0], xm0
+    movd   [dstq+strideq*1], xm0
+    movd   [dstq+strideq*2], xm0
+    movd   [dstq+stride3q ], xm0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s4
+    RET
+ALIGN function_align
+.h8:
+    movq                xm0, [tlq-8]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w8:
+    movq                xm1, [tlq+1]
+    vextracti128        xm2, m0, 1
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm2
+    punpckhqdq          xm2, xm0, xm0
+    paddw               xm0, xm2
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 8
+    je .w8_end
+    mov                 r6d, 0x5556 ; ~1/3 in 0.16 fixed point
+    mov                 r2d, 0x3334 ; ~1/5
+    cmp                  hd, 32
+    cmovz               r6d, r2d ; h == 32: w+h = 40, divide by 5; else by 3
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w8_end:
+    vpbroadcastb        xm0, xm0
+.s8:
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm0
+    movq   [dstq+strideq*2], xm0
+    movq   [dstq+stride3q ], xm0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s8
+    RET
+ALIGN function_align
+.h16:
+    mova                xm0, [tlq-16]
+    pmaddubsw           xm0, xm3
+    jmp                  wq
+.w16:
+    movu                xm1, [tlq+1]
+    vextracti128        xm2, m0, 1
+    pmaddubsw           xm1, xm3
+    psubw               xm0, xm4
+    paddw               xm0, xm2
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 16
+    je .w16_end
+    mov                 r6d, 0x5556
+    mov                 r2d, 0x3334
+    test                 hb, 8|32 ; ZF set for h = 4 or 64 (w+h = 20, 80: ~1/5)
+    cmovz               r6d, r2d
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w16_end:
+    vpbroadcastb        xm0, xm0
+.s16:
+    mova   [dstq+strideq*0], xm0
+    mova   [dstq+strideq*1], xm0
+    mova   [dstq+strideq*2], xm0
+    mova   [dstq+stride3q ], xm0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s16
+    RET
+ALIGN function_align
+.h32:
+    mova                 m0, [tlq-32]
+    pmaddubsw            m0, m3
+    jmp                  wq
+.w32:
+    movu                 m1, [tlq+1]
+    pmaddubsw            m1, m3
+    paddw                m0, m1
+    vextracti128        xm1, m0, 1
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 32
+    je .w32_end
+    lea                 r2d, [hq*2]
+    mov                 r6d, 0x33345556 ; ~1/5 in the high word, ~1/3 in the low word
+    shrx                r6d, r6d, r2d ; h=8: >>16 -> ~1/5; h=16,64: count wraps to 0, low word ~1/3
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w32_end:
+    vpbroadcastb         m0, xm0
+.s32:
+    mova   [dstq+strideq*0], m0
+    mova   [dstq+strideq*1], m0
+    mova   [dstq+strideq*2], m0
+    mova   [dstq+stride3q ], m0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s32
+    RET
+ALIGN function_align
+.h64:
+    mova                 m0, [tlq-64]
+    mova                 m1, [tlq-32]
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m3
+    paddw                m0, m1
+    jmp                  wq
+.w64:
+    movu                 m1, [tlq+ 1]
+    movu                 m2, [tlq+33]
+    pmaddubsw            m1, m3
+    pmaddubsw            m2, m3
+    paddw                m0, m1
+    paddw                m0, m2
+    vextracti128        xm1, m0, 1
+    psubw               xm0, xm4
+    paddw               xm0, xm1
+    punpckhqdq          xm1, xm0, xm0
+    paddw               xm0, xm1
+    psrlq               xm1, xm0, 32
+    paddw               xm0, xm1
+    pmaddwd             xm0, xm3
+    psrlw               xm0, xm5
+    cmp                  hd, 64
+    je .w64_end
+    mov                 r6d, 0x33345556
+    shrx                r6d, r6d, hd ; h=16: ~1/5; h=32: count wraps to 0, low word ~1/3
+    movd                xm1, r6d
+    pmulhuw             xm0, xm1
+.w64_end:
+    vpbroadcastb         m0, xm0
+    mova                 m1, m0
+.s64:
+    mova [dstq+strideq*0+32*0], m0
+    mova [dstq+strideq*0+32*1], m1
+    mova [dstq+strideq*1+32*0], m0
+    mova [dstq+strideq*1+32*1], m1
+    mova [dstq+strideq*2+32*0], m0
+    mova [dstq+strideq*2+32*1], m1
+    mova [dstq+stride3q +32*0], m0
+    mova [dstq+stride3q +32*1], m1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .s64
+    RET
+
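+; dc_128 and v only have to materialize the prediction in m0 (and m1 for the
+; 64-wide store loop) and then jump into the shared .s* splat loops.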
+cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
+    lea                  r5, [ipred_dc_splat_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    vpbroadcastd         m0, [r5-ipred_dc_splat_avx2_table+pb_128] ; pb_128, addressed off the table base in r5
+    mova                 m1, m0
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+
+cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
+    lea                  r5, [ipred_dc_splat_avx2_table]
+    tzcnt                wd, wm
+    movu                 m0, [tlq+ 1]
+    movu                 m1, [tlq+33]
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+
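+; h: tl[-1] is the left neighbour of the first row, tl[-2] of the second, etc.
+; Each iteration broadcasts four left-edge bytes across four rows; widths up to
+; 16 use XMM stores, 32 and 64 use YMM (w64 stores each broadcast register to
+; both 32-byte halves of the row).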
+%macro IPRED_H 2 ; w, store_type
+    vpbroadcastb         m0, [tlq-1]
+    vpbroadcastb         m1, [tlq-2]
+    vpbroadcastb         m2, [tlq-3]
+    sub                 tlq, 4
+    vpbroadcastb         m3, [tlq+0]
+    mov%2  [dstq+strideq*0], m0
+    mov%2  [dstq+strideq*1], m1
+    mov%2  [dstq+strideq*2], m2
+    mov%2  [dstq+stride3q ], m3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w%1
+    RET
+ALIGN function_align
+%endmacro
+
+INIT_XMM avx2
+cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3
+    lea                  r5, [ipred_h_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    IPRED_H               4, d
+.w8:
+    IPRED_H               8, q
+.w16:
+    IPRED_H              16, a
+INIT_YMM avx2
+.w32:
+    IPRED_H              32, a
+.w64:
+    vpbroadcastb         m0, [tlq-1]
+    vpbroadcastb         m1, [tlq-2]
+    vpbroadcastb         m2, [tlq-3]
+    sub                 tlq, 4
+    vpbroadcastb         m3, [tlq+0]
+    mova [dstq+strideq*0+32*0], m0
+    mova [dstq+strideq*0+32*1], m0
+    mova [dstq+strideq*1+32*0], m1
+    mova [dstq+strideq*1+32*1], m1
+    mova [dstq+strideq*2+32*0], m2
+    mova [dstq+strideq*2+32*1], m2
+    mova [dstq+stride3q +32*0], m3
+    mova [dstq+stride3q +32*1], m3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w64
+    RET
+
+%endif
-- 
GitLab