rav1e commit db224723
Authored 6 years ago by Henrik Gramner, committed 6 years ago by David Michael Barr

x86: Add dc/h/v intra prediction AVX2 asm

Parent: 9cb02af7
Showing 1 changed file: src/x86/ipred.asm (new file, mode 100644), 399 additions and 0 deletions
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
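; 8-bit dc, dc_top, dc_left, dc_128, v and h intra predictors, AVX2.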
SECTION_RODATA

pb_128: times 4 db 128

%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro
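; Each table entry is a signed 32-bit offset from the exported
; <name>_<isa>_table symbol to a local label; the dispatchers load an
; entry with movsxd, add the table address back and jump. The -2*4 bias
; lets tzcnt of the block dimension (log2, at least 2) index the table
; directly. ipred_dc_splat_avx2_table below points into the middle of
; ipred_dc's table so the other predictors can reuse its s4-s64 store
; loops; the s*-10*4 entries compensate for that extra offset.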
%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
JMP_TABLE ipred_dc,      avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                               s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_h,       avx2, w4, w8, w16, w32, w64
SECTION .text

INIT_YMM avx2
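; dc_top: dc is the average of the w pixels above the block. pcmpeqd
; yields an all-ones register (-1 per byte), so pmaddubsw produces
; negated pair sums; pmaddwd by -1 in the shared .h* tail restores
; positive dword sums, and pmulhrsw by 0x8000>>log2(n) is a rounded
; division by the pixel count n.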
cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
    lea                  r5, [ipred_dc_left_avx2_table]
    tzcnt                wd, wm
    inc                 tlq
    movu                 m0, [tlq]
    movifnidn            hd, hm
    mov                 r6d, 0x8000
    shrx                r6d, r6d, wd
    movd                xm3, r6d
    movsxd               r6, [r5+wq*4]
    pcmpeqd              m2, m2
    pmaddubsw            m0, m2
    add                  r6, r5
    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  r6
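; dc_left: as dc_top, but averaging the h pixels to the left of the
; block (the left edge is stored contiguously just below the top-left
; pointer). dc_top dispatches into the .h* reduction chain below, which
; is why .h64 notes its load may be unaligned.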
cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
    mov                  hd, hm ; zero upper half
    tzcnt               r6d, hd
    sub                 tlq, hq
    tzcnt                wd, wm
    movu                 m0, [tlq]
    mov                 r5d, 0x8000
    shrx                r5d, r5d, r6d
    movd                xm3, r5d
    lea                  r5, [ipred_dc_left_avx2_table]
    movsxd               r6, [r5+r6*4]
    pcmpeqd              m2, m2
    pmaddubsw            m0, m2
    add                  r6, r5
    add                  r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  r6
.h64:
    movu                 m1, [tlq+32] ; unaligned when jumping here from dc_top
    pmaddubsw            m1, m2
    paddw                m0, m1
.h32:
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
.h16:
    punpckhqdq          xm1, xm0, xm0
    paddw               xm0, xm1
.h8:
    psrlq               xm1, xm0, 32
    paddw               xm0, xm1
.h4:
    pmaddwd             xm0, xm2
    pmulhrsw            xm0, xm3
    lea            stride3q, [strideq*3]
    vpbroadcastb         m0, xm0
    mova                 m1, m0
    jmp                  wq
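; dc: averages the w top and h left pixels together. xm4 holds (w+h)/2
; as the rounding bias and xm5 holds tzcnt(w+h) as the shift amount;
; each height handler .h* sums the left edge, then jumps to the
; matching width handler .w*, which adds the top edge and divides.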
cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    movifnidn            wd, wm
    tzcnt               r6d, hd
    lea                 r5d, [wq+hq]
    movd                xm4, r5d
    tzcnt               r5d, r5d
    movd                xm5, r5d
    lea                  r5, [ipred_dc_avx2_table]
    tzcnt                wd, wd
    movsxd               r6, [r5+r6*4]
    movsxd               wq, [r5+wq*4+5*4]
    pcmpeqd              m3, m3
    psrlw               xm4, 1
    add                  r6, r5
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  r6
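; For square blocks w+h is a power of two and a shift completes the
; division. For rectangular blocks w+h is 3 or 5 times a power of two:
; after the power-of-two shift, pmulhuw by 0x5556 (~2^16/3) or 0x3334
; (~2^16/5) supplies the remaining /3 or /5. Constants like 0x55563334
; pack both multipliers, with shrx keyed on the block height selecting
; the right one.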
.h4:
    movd                xm0, [tlq-4]
    pmaddubsw           xm0, xm3
    jmp                  wq
.w4:
    movd                xm1, [tlq+1]
    pmaddubsw           xm1, xm3
    psubw               xm0, xm4
    paddw               xm0, xm1
    pmaddwd             xm0, xm3
    cmp                  hd, 4
    jg .w4_mul
    psrlw               xm0, 3
    jmp .w4_end
.w4_mul:
    punpckhqdq          xm1, xm0, xm0
    lea                 r2d, [hq*2]
    mov                 r6d, 0x55563334
    paddw               xm0, xm1
    shrx                r6d, r6d, r2d
    psrlq               xm1, xm0, 32
    paddw               xm0, xm1
    movd                xm1, r6d
    psrlw               xm0, 2
    pmulhuw             xm0, xm1
.w4_end:
    vpbroadcastb        xm0, xm0
.s4:
    movd   [dstq+strideq*0], xm0
    movd   [dstq+strideq*1], xm0
    movd   [dstq+strideq*2], xm0
    movd   [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4
    RET
ALIGN function_align
.h8:
    movq                xm0, [tlq-8]
    pmaddubsw           xm0, xm3
    jmp                  wq
.w8:
    movq                xm1, [tlq+1]
    vextracti128        xm2, m0, 1
    pmaddubsw           xm1, xm3
    psubw               xm0, xm4
    paddw               xm0, xm2
    punpckhqdq          xm2, xm0, xm0
    paddw               xm0, xm2
    paddw               xm0, xm1
    psrlq               xm1, xm0, 32
    paddw               xm0, xm1
    pmaddwd             xm0, xm3
    psrlw               xm0, xm5
    cmp                  hd, 8
    je .w8_end
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    cmp                  hd, 32
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
.w8_end:
    vpbroadcastb        xm0, xm0
.s8:
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm0
    movq   [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s8
    RET
ALIGN function_align
.h16:
    mova                xm0, [tlq-16]
    pmaddubsw           xm0, xm3
    jmp                  wq
.w16:
    movu                xm1, [tlq+1]
    vextracti128        xm2, m0, 1
    pmaddubsw           xm1, xm3
    psubw               xm0, xm4
    paddw               xm0, xm2
    paddw               xm0, xm1
    punpckhqdq          xm1, xm0, xm0
    paddw               xm0, xm1
    psrlq               xm1, xm0, 32
    paddw               xm0, xm1
    pmaddwd             xm0, xm3
    psrlw               xm0, xm5
    cmp                  hd, 16
    je .w16_end
    mov                 r6d, 0x5556
    mov                 r2d, 0x3334
    test                 hb, 8|32
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
.w16_end:
    vpbroadcastb        xm0, xm0
.s16:
    mova   [dstq+strideq*0], xm0
    mova   [dstq+strideq*1], xm0
    mova   [dstq+strideq*2], xm0
    mova   [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s16
    RET
ALIGN function_align
.h32:
    mova                 m0, [tlq-32]
    pmaddubsw            m0, m3
    jmp                  wq
.w32:
    movu                 m1, [tlq+1]
    pmaddubsw            m1, m3
    paddw                m0, m1
    vextracti128        xm1, m0, 1
    psubw               xm0, xm4
    paddw               xm0, xm1
    punpckhqdq          xm1, xm0, xm0
    paddw               xm0, xm1
    psrlq               xm1, xm0, 32
    paddw               xm0, xm1
    pmaddwd             xm0, xm3
    psrlw               xm0, xm5
    cmp                  hd, 32
    je .w32_end
    lea                 r2d, [hq*2]
    mov                 r6d, 0x33345556
    shrx                r6d, r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
.w32_end:
    vpbroadcastb         m0, xm0
.s32:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m0
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s32
    RET
ALIGN function_align
.h64:
    mova                 m0, [tlq-64]
    mova                 m1, [tlq-32]
    pmaddubsw            m0, m3
    pmaddubsw            m1, m3
    paddw                m0, m1
    jmp                  wq
.w64:
    movu                 m1, [tlq+ 1]
    movu                 m2, [tlq+33]
    pmaddubsw            m1, m3
    pmaddubsw            m2, m3
    paddw                m0, m1
    paddw                m0, m2
    vextracti128        xm1, m0, 1
    psubw               xm0, xm4
    paddw               xm0, xm1
    punpckhqdq          xm1, xm0, xm0
    paddw               xm0, xm1
    psrlq               xm1, xm0, 32
    paddw               xm0, xm1
    pmaddwd             xm0, xm3
    psrlw               xm0, xm5
    cmp                  hd, 64
    je .w64_end
    mov                 r6d, 0x33345556
    shrx                r6d, r6d, hd
    movd                xm1, r6d
    pmulhuw             xm0, xm1
.w64_end:
    vpbroadcastb         m0, xm0
    mova                 m1, m0
.s64:
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m0
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*2+32*0], m0
    mova [dstq+strideq*2+32*1], m1
    mova [dstq+stride3q +32*0], m0
    mova [dstq+stride3q +32*1], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s64
    RET
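; dc_128: no neighbouring pixels are used; the block is filled with the
; mid-grey value 128 broadcast from pb_128.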
cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
    lea                  r5, [ipred_dc_splat_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    vpbroadcastd         m0, [r5-ipred_dc_splat_avx2_table+pb_128]
    mova                 m1, m0
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
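; v: every output row is a copy of the row of pixels directly above the
; block; m0/m1 hold up to 64 source pixels and the shared s* store
; loops write them out.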
cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
    lea                  r5, [ipred_dc_splat_avx2_table]
    tzcnt                wd, wm
    movu                 m0, [tlq+ 1]
    movu                 m1, [tlq+33]
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
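; h: every output row is filled with the pixel immediately to its left.
; IPRED_H writes four rows per iteration; %1 is the block width used in
; the loop label and %2 selects the store width (mov%2 expands to
; movd, movq or mova).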
%macro IPRED_H 2 ; w, store_type
    vpbroadcastb         m0, [tlq-1]
    vpbroadcastb         m1, [tlq-2]
    vpbroadcastb         m2, [tlq-3]
    sub                 tlq, 4
    vpbroadcastb         m3, [tlq+0]
    mov%2  [dstq+strideq*0], m0
    mov%2  [dstq+strideq*1], m1
    mov%2  [dstq+strideq*2], m2
    mov%2  [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w%1
    RET
ALIGN function_align
%endmacro
INIT_XMM avx2
cglobal ipred_h, 3, 6, 4, dst, stride, tl, w, h, stride3
    lea                  r5, [ipred_h_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
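; Widths 4-16 need only XMM stores; .w32 switches to YMM via INIT_YMM
; so vpbroadcastb fills a full 32-byte row, and .w64 stores two YMM
; registers per row.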
.w4:
    IPRED_H               4, d
.w8:
    IPRED_H               8, q
.w16:
    IPRED_H              16, a
INIT_YMM avx2
.w32:
    IPRED_H              32, a
.w64:
    vpbroadcastb         m0, [tlq-1]
    vpbroadcastb         m1, [tlq-2]
    vpbroadcastb         m2, [tlq-3]
    sub                 tlq, 4
    vpbroadcastb         m3, [tlq+0]
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m0
    mova [dstq+strideq*1+32*0], m1
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*2+32*0], m2
    mova [dstq+strideq*2+32*1], m2
    mova [dstq+stride3q +32*0], m3
    mova [dstq+stride3q +32*1], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w64
    RET
%endif