Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
89963bf5
Commit
89963bf5
authored
Sep 05, 2014
by
Dmitry Kovalev
Committed by
Gerrit Code Review
Sep 05, 2014
Browse files
Merge "Removing postproc mmx code."
parents
ebac8f34
1100e262
Changes
3
Hide whitespace changes
Inline
Side-by-side
vp9/common/vp9_rtcd_defs.pl
View file @
89963bf5
...
...
@@ -268,7 +268,7 @@ $vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon;
#
if
(
vpx_config
("
CONFIG_VP9_POSTPROC
")
eq
"
yes
")
{
add_proto
qw/void vp9_mbpost_proc_down/
,
"
uint8_t *dst, int pitch, int rows, int cols, int flimit
";
specialize
qw/vp9_mbpost_proc_down
mmx
sse2/
;
specialize
qw/vp9_mbpost_proc_down sse2/
;
$vp9_mbpost_proc_down_sse2
=
vp9_mbpost_proc_down_xmm
;
add_proto
qw/void vp9_mbpost_proc_across_ip/
,
"
uint8_t *src, int pitch, int rows, int cols, int flimit
";
...
...
@@ -276,11 +276,11 @@ specialize qw/vp9_mbpost_proc_across_ip sse2/;
$vp9_mbpost_proc_across_ip_sse2
=
vp9_mbpost_proc_across_ip_xmm
;
add_proto
qw/void vp9_post_proc_down_and_across/
,
"
const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit
";
specialize
qw/vp9_post_proc_down_and_across
mmx
sse2/
;
specialize
qw/vp9_post_proc_down_and_across sse2/
;
$vp9_post_proc_down_and_across_sse2
=
vp9_post_proc_down_and_across_xmm
;
add_proto
qw/void vp9_plane_add_noise/
,
"
uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch
";
specialize
qw/vp9_plane_add_noise
mmx
sse2/
;
specialize
qw/vp9_plane_add_noise sse2/
;
$vp9_plane_add_noise_sse2
=
vp9_plane_add_noise_wmt
;
}
...
...
vp9/common/x86/vp9_postproc_mmx.asm
deleted
100644 → 0
View file @
ebac8f34
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define VP9_FILTER_WEIGHT 128
%define VP9_FILTER_SHIFT 7
;-----------------------------------------------------------------------
; void vp9_post_proc_down_and_across_mmx
; (
;     unsigned char *src_ptr,
;     unsigned char *dst_ptr,
;     int src_pixels_per_line,
;     int dst_pixels_per_line,
;     int rows,
;     int cols,
;     int flimit
; )
;
; Per row, two 5-tap passes, four pixels per iteration:
;   1. "down":   vertical filter over rows r-2..r+2 of src, written to dst.
;   2. "across": horizontal filter over p-2..p+2 of the dst row, in place.
; Taps are read from Blur (16, 16, 64, 16, 16), the sum is rounded with rd
; and shifted right by VP9_FILTER_SHIFT.  A pixel keeps its unfiltered
; value when |centre - neighbour| > flimit for any of the four neighbours.
;
; Register roles inside the loops:
;   rsi = src row      rdi = dst row       rax = src pitch (negated briefly)
;   rbx = &Blur        rcx = rows left     rdx = column offset
;   mm0 = 0            mm2 = flimit x4     mm1 = unfiltered centre pixels
;   mm3 = accumulator  mm7 = "over threshold" mask
;
; No emms is issued here.
;-----------------------------------------------------------------------
global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
sym(vp9_post_proc_down_and_across_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rbx is repurposed as the Blur base for the whole function, so the
    ; rounding constant is addressed off it using the assemble-time
    ; distance between the two labels.  This needs no GOT register and no
    ; stack copy, so it stays valid across the push/pop pairs below and
    ; leaves nothing extra to release before the epilog pops.
%define RD [rbx + rd - Blur]

    push        rbx                             ; preserve GET_GOT's register
    lea         rbx, [GLOBAL(Blur)]

    movd        mm2, dword ptr arg(6)           ; flimit
    punpcklwd   mm2, mm2
    punpckldq   mm2, mm2                        ; mm2 = flimit in all 4 words

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(1)                     ; dst_ptr

    movsxd      rcx, DWORD PTR arg(4)           ; rows
    movsxd      rax, DWORD PTR arg(2)           ; src_pixels_per_line
    pxor        mm0, mm0                        ; mm0 = 0, used for unpacking

.nextrow:
    xor         rdx, rdx                        ; column offset = 0

.nextcol:
    pxor        mm7, mm7

    ; ---- centre row r0, tap 2 ----
    movq        mm6, [rbx + 32]                 ; tap 2
    movq        mm3, [rsi]                      ; r0 p0..p7
    punpcklbw   mm3, mm0                        ; mm3 = r0 p0..p3 (words)
    movq        mm1, mm3                        ; mm1 = unfiltered r0 p0..p3
    pmullw      mm3, mm6                        ; acc = r0 * tap 2

    ; ---- row r+1, tap 3 ----
    movq        mm6, [rbx + 48]                 ; tap 3
    movq        mm5, [rsi + rax]                ; r1 p0..p7
    punpcklbw   mm5, mm0                        ; mm5 = r1 p0..p3
    pmullw      mm6, mm5                        ; mm6 = r1 * tap 3
    paddusw     mm3, mm6                        ; acc += mm6

    ; thresholding
    movq        mm7, mm1
    psubusw     mm7, mm5                        ; sat(r0 - r1)
    psubusw     mm5, mm1                        ; sat(r1 - r0)
    paddusw     mm7, mm5                        ; mm7 = |r0 - r1|
    pcmpgtw     mm7, mm2                        ; mask = |r0 - r1| > flimit

    ; ---- row r+2, tap 4 ----
    movq        mm6, [rbx + 64]                 ; tap 4
    movq        mm5, [rsi + 2*rax]              ; r2 p0..p7
    punpcklbw   mm5, mm0                        ; mm5 = r2 p0..p3
    pmullw      mm6, mm5                        ; mm6 = r2 * tap 4
    paddusw     mm3, mm6                        ; acc += mm6

    ; thresholding
    movq        mm6, mm1
    psubusw     mm6, mm5                        ; sat(r0 - r2)
    psubusw     mm5, mm1                        ; sat(r2 - r0)
    paddusw     mm6, mm5                        ; mm6 = |r0 - r2|
    pcmpgtw     mm6, mm2
    por         mm7, mm6                        ; accumulate thresholds

    ; ---- row r-2, tap 0 (pitch negated to reach the rows above) ----
    neg         rax
    movq        mm6, [rbx]                      ; tap 0
    movq        mm5, [rsi + 2*rax]              ; r-2 p0..p7
    punpcklbw   mm5, mm0                        ; mm5 = r-2 p0..p3
    pmullw      mm6, mm5                        ; mm6 = r-2 * tap 0
    paddusw     mm3, mm6                        ; acc += mm6

    ; thresholding
    movq        mm6, mm1
    psubusw     mm6, mm5                        ; sat(r0 - r-2)
    psubusw     mm5, mm1                        ; sat(r-2 - r0)
    paddusw     mm6, mm5                        ; mm6 = |r0 - r-2|
    pcmpgtw     mm6, mm2
    por         mm7, mm6                        ; accumulate thresholds

    ; ---- row r-1, tap 1 ----
    movq        mm6, [rbx + 16]                 ; tap 1
    movq        mm4, [rsi + rax]                ; r-1 p0..p7
    punpcklbw   mm4, mm0                        ; mm4 = r-1 p0..p3
    pmullw      mm6, mm4                        ; mm6 = r-1 * tap 1
    paddusw     mm3, mm6                        ; acc += mm6

    ; thresholding
    movq        mm6, mm1
    psubusw     mm6, mm4                        ; sat(r0 - r-1)
    psubusw     mm4, mm1                        ; sat(r-1 - r0)
    paddusw     mm6, mm4                        ; mm6 = |r0 - r-1|
    pcmpgtw     mm6, mm2
    por         mm7, mm6                        ; accumulate thresholds

    ; ---- round, scale, select, store ----
    paddusw     mm3, RD                         ; acc += round value
    psraw       mm3, VP9_FILTER_SHIFT           ; acc >>= VP9_FILTER_SHIFT

    pand        mm1, mm7                        ; source where over threshold
    pandn       mm7, mm3                        ; blurred where within threshold
    paddusw     mm1, mm7                        ; combine (lanes are disjoint)

    packuswb    mm1, mm0                        ; pack to bytes
    movd        [rdi], mm1

    neg         rax                             ; pitch is positive again
    add         rsi, 4
    add         rdi, 4
    add         rdx, 4

    cmp         edx, dword ptr arg(5)           ; cols
    jl          .nextcol

    ; done with all the cols, start the across filtering in place
    sub         rsi, rdx                        ; rewind to start of row
    sub         rdi, rdx

    push        rax                             ; save src pitch; eax carries pixels below
    xor         rdx, rdx
    ; Seed the deferred store with the 4 bytes left of the row so the first
    ; store at [rdi + rdx - 4] rewrites them unchanged.  Only eax is ever
    ; consumed, so a dword load suffices.
    mov         eax, DWORD PTR [rdi - 4]

.acrossnextcol:
    pxor        mm7, mm7

    ; ---- centre pixel p0, tap 2 ----
    movq        mm6, [rbx + 32]                 ; tap 2
    movq        mm4, [rdi + rdx]                ; mm4 = p0..p7
    movq        mm3, mm4
    punpcklbw   mm3, mm0                        ; mm3 = p0..p3
    movq        mm1, mm3                        ; mm1 = unfiltered p0..p3
    pmullw      mm3, mm6                        ; acc = p0..p3 * tap 2

    ; ---- p+1, tap 3 ----
    movq        mm6, [rbx + 48]                 ; tap 3
    psrlq       mm4, 8                          ; mm4 = p1..p7
    movq        mm5, mm4
    punpcklbw   mm5, mm0                        ; mm5 = p1..p4
    pmullw      mm6, mm5                        ; mm6 = p1..p4 * tap 3
    paddusw     mm3, mm6                        ; acc += mm6

    ; thresholding
    movq        mm7, mm1
    psubusw     mm7, mm5                        ; sat(p0..p3 - p1..p4)
    psubusw     mm5, mm1                        ; sat(p1..p4 - p0..p3)
    paddusw     mm7, mm5                        ; mm7 = |p0..p3 - p1..p4|
    pcmpgtw     mm7, mm2

    ; ---- p+2, tap 4 ----
    movq        mm6, [rbx + 64]                 ; tap 4
    psrlq       mm4, 8                          ; mm4 = p2..p7
    movq        mm5, mm4
    punpcklbw   mm5, mm0                        ; mm5 = p2..p5
    pmullw      mm6, mm5                        ; mm6 = p2..p5 * tap 4
    paddusw     mm3, mm6                        ; acc += mm6

    ; thresholding
    movq        mm6, mm1
    psubusw     mm6, mm5                        ; sat(p0..p3 - p2..p5)
    psubusw     mm5, mm1                        ; sat(p2..p5 - p0..p3)
    paddusw     mm6, mm5                        ; mm6 = |p0..p3 - p2..p5|
    pcmpgtw     mm6, mm2
    por         mm7, mm6                        ; accumulate thresholds

    ; ---- p-2, tap 0 ----
    ; The bytes left of rdx have not been overwritten yet: the store of the
    ; previous group is deferred until after these loads (see eax below).
    movq        mm6, [rbx]                      ; tap 0
    movq        mm4, [rdi + rdx - 2]            ; mm4 = p-2..p5
    movq        mm5, mm4
    punpcklbw   mm5, mm0                        ; mm5 = p-2..p1
    pmullw      mm6, mm5                        ; mm6 = p-2..p1 * tap 0
    paddusw     mm3, mm6                        ; acc += mm6

    ; thresholding
    movq        mm6, mm1
    psubusw     mm6, mm5                        ; sat(p0..p3 - p-2..p1)
    psubusw     mm5, mm1                        ; sat(p-2..p1 - p0..p3)
    paddusw     mm6, mm5                        ; mm6 = |p0..p3 - p-2..p1|
    pcmpgtw     mm6, mm2
    por         mm7, mm6                        ; accumulate thresholds

    ; ---- p-1, tap 1 ----
    movq        mm6, [rbx + 16]                 ; tap 1
    psrlq       mm4, 8                          ; mm4 = p-1..p5
    punpcklbw   mm4, mm0                        ; mm4 = p-1..p2
    pmullw      mm6, mm4                        ; mm6 = p-1..p2 * tap 1
    paddusw     mm3, mm6                        ; acc += mm6

    ; thresholding
    movq        mm6, mm1
    psubusw     mm6, mm4                        ; sat(p0..p3 - p-1..p2)
    psubusw     mm4, mm1                        ; sat(p-1..p2 - p0..p3)
    paddusw     mm6, mm4                        ; mm6 = |p0..p3 - p-1..p2|
    pcmpgtw     mm6, mm2
    por         mm7, mm6                        ; accumulate thresholds

    ; ---- round, scale, select ----
    paddusw     mm3, RD                         ; acc += round value
    psraw       mm3, VP9_FILTER_SHIFT           ; acc >>= VP9_FILTER_SHIFT

    pand        mm1, mm7                        ; source where over threshold
    pandn       mm7, mm3                        ; blurred where within threshold
    paddusw     mm1, mm7                        ; combine (lanes are disjoint)

    packuswb    mm1, mm0                        ; pack to bytes
    mov         DWORD PTR [rdi + rdx - 4], eax  ; store previous four bytes
    movd        eax, mm1                        ; hold this group for the next pass

    add         rdx, 4
    cmp         edx, dword ptr arg(5)           ; cols
    jl          .acrossnextcol

    mov         DWORD PTR [rdi + rdx - 4], eax  ; flush the last group
    pop         rax                             ; rax = src pitch again

    ; done with this row
    add         rsi, rax                        ; next source line
    movsxd      rax, dword ptr arg(3)           ; dst_pixels_per_line
    add         rdi, rax                        ; next destination line
    movsxd      rax, dword ptr arg(2)           ; src_pixels_per_line

    dec         rcx                             ; decrement row count
    jnz         .nextrow

    pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD
;-----------------------------------------------------------------------
; void vp9_mbpost_proc_down_mmx(unsigned char *dst,
;                               int pitch, int rows, int cols, int flimit)
;
; Works through the plane in strips four pixels wide.  For each strip a
; running sum (mm5, 4 words) and running sum of squares (mm6:mm7, 4 dwords)
; are kept over a 15-row vertical window.  For every row:
;     mask = (15 * sumsq - sum * sum - flimit) < 0
;     new  = (pixel + sum + vp9_rv[row & 127 ..]) >> 4
; and the pixel is replaced by `new` where mask is set.  Results are
; parked in a 16-entry dword ring buffer on the stack and written back
; eight rows late, so later window reads still see unfiltered rows.
;
; Stack scratch (136 bytes, 16-byte aligned base):
;   [rsp + 0 .. 63]   ring buffer, slot = row & 15, 4 bytes each
;   [rsp + 128]       flimit2: flimit replicated into two dwords
;
; The rows, cols and dst argument slots are modified in place.
; No emms is issued here.
;-----------------------------------------------------------------------
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_mmx) PRIVATE
sym(vp9_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; create flimit2 at [rsp+128]
    mov         eax, dword ptr arg(4)           ; flimit
    mov         [rsp + 128], eax
    mov         [rsp + 128 + 4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8, [GLOBAL(sym(vp9_rv))]       ; r8 = &vp9_rv for the whole call
%endif

    ; rows += 8  (the loop runs 8 extra rows to drain the delayed writes)
    add         dword ptr arg(2), 8

    ; for (c = 0; c < cols; c += 4)
.loop_col:
    mov         rsi, arg(0)                     ; s
    pxor        mm0, mm0                        ; mm0 = 0, used for unpacking

    movsxd      rax, dword ptr arg(1)           ; pitch
    neg         rax                             ; rax = -pitch
    lea         rsi, [rsi + rax*8]              ; rsi = s - 8 * pitch
    neg         rax                             ; rax = pitch

    pxor        mm5, mm5                        ; sum
    pxor        mm6, mm6                        ; sum of squares, pixels 0..1
    pxor        mm7, mm7                        ; sum of squares, pixels 2..3

    mov         rdi, rsi
    mov         rcx, 15                         ; prime the window with 15 rows

.loop_initvar:
    movd        mm1, DWORD PTR [rdi]
    punpcklbw   mm1, mm0                        ; 4 pixels as words

    paddw       mm5, mm1                        ; sum += pixels
    pmullw      mm1, mm1                        ; pixels^2

    movq        mm2, mm1
    punpcklwd   mm1, mm0                        ; squares 0..1 as dwords
    punpckhwd   mm2, mm0                        ; squares 2..3 as dwords

    paddd       mm6, mm1
    paddd       mm7, mm2

    lea         rdi, [rdi + rax]                ; next row

    dec         rcx
    jne         .loop_initvar
    ; here rsi = s - 8 * pitch, rdi = s + 7 * pitch

    xor         rdx, rdx                        ; row counter

.loop_row:
    ; NOTE(review): for the first 8 rows the ring-buffer slot read back
    ; below has not been written in this strip yet, and the store lands
    ; above s -- presumably the caller provides a border there; confirm.
    movd        mm1, DWORD PTR [rsi]            ; row leaving the window
    movd        mm2, DWORD PTR [rdi]            ; row entering the window

    punpcklbw   mm1, mm0
    punpcklbw   mm2, mm0

    paddw       mm5, mm2                        ; sum += entering
    psubw       mm5, mm1                        ; sum -= leaving

    pmullw      mm2, mm2                        ; entering^2
    movq        mm4, mm2

    punpcklwd   mm2, mm0
    punpckhwd   mm4, mm0

    paddd       mm6, mm2                        ; sumsq += entering^2
    paddd       mm7, mm4

    pmullw      mm1, mm1                        ; leaving^2
    movq        mm2, mm1

    punpcklwd   mm1, mm0
    psubd       mm6, mm1                        ; sumsq -= leaving^2

    punpckhwd   mm2, mm0
    psubd       mm7, mm2

    movq        mm3, mm6
    pslld       mm3, 4
    psubd       mm3, mm6                        ; mm3 = 15 * sumsq (pixels 0..1)

    movq        mm1, mm5
    movq        mm4, mm5

    pmullw      mm1, mm1                        ; low  16 bits of sum * sum
    pmulhw      mm4, mm4                        ; high 16 bits of sum * sum
    movq        mm2, mm1

    punpcklwd   mm1, mm4                        ; sum^2 as dwords, pixels 0..1
    punpckhwd   mm2, mm4                        ; sum^2 as dwords, pixels 2..3

    movq        mm4, mm7
    pslld       mm4, 4
    psubd       mm4, mm7                        ; mm4 = 15 * sumsq (pixels 2..3)

    psubd       mm3, mm1
    psubd       mm4, mm2

    psubd       mm3, flimit2
    psubd       mm4, flimit2

    psrad       mm3, 31                         ; all ones where the result is negative
    psrad       mm4, 31

    packssdw    mm3, mm4
    packsswb    mm3, mm0                        ; mm3 = per-byte select mask

    movd        mm1, DWORD PTR [rsi + rax*8]    ; current row (rsi is 8 rows above)

    movq        mm2, mm1                        ; mm2 = original bytes
    punpcklbw   mm1, mm0

    paddw       mm1, mm5                        ; pixel + sum
    mov         rcx, rdx

    and         rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax
    lea         rax, [GLOBAL(sym(vp9_rv))]
    movq        mm4, [rax + rcx*2]              ; vp9_rv[rcx*2]
    pop         rax
%elif ABI_IS_32BIT=0
    movq        mm4, [r8 + rcx*2]               ; vp9_rv[rcx*2]
%else
    movq        mm4, [sym(vp9_rv) + rcx*2]
%endif
    paddw       mm1, mm4                        ; + rounding/dither words
    psraw       mm1, 4

    packuswb    mm1, mm0
    pand        mm1, mm3                        ; filtered where mask set

    pandn       mm3, mm2                        ; original elsewhere
    por         mm1, mm3

    and         rcx, 15
    movd        DWORD PTR [rsp + rcx*4], mm1    ; d[row & 15] = result

    mov         rcx, rdx
    sub         rcx, 8

    and         rcx, 15
    movd        mm1, DWORD PTR [rsp + rcx*4]    ; d[(row - 8) & 15]

    movd        [rsi], mm1                      ; write back, 8 rows late
    lea         rsi, [rsi + rax]

    lea         rdi, [rdi + rax]
    add         rdx, 1

    cmp         edx, dword arg(2)               ; rows
    jl          .loop_row

    ; s += 4.  arg(0) holds a pointer, so advance it through a full-width
    ; register: a dword add on the slot would drop the carry into the
    ; upper half on 64-bit ABIs.  rsi is reloaded at .loop_col anyway.
    mov         rsi, arg(0)
    add         rsi, 4
    mov         arg(0), rsi

    sub         dword arg(3), 4                 ; cols -= 4
    cmp         dword arg(3), 0
    jg          .loop_col

    add         rsp, 136
    pop         rsp                             ; undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2
;-----------------------------------------------------------------------
; void vp9_plane_add_noise_mmx(unsigned char *start, unsigned char *noise,
;                              unsigned char blackclamp[16],
;                              unsigned char whiteclamp[16],
;                              unsigned char bothclamp[16],
;                              unsigned int width, unsigned int height,
;                              int pitch)
;
; For every row: pick a noise offset from the low byte of LIBVPX_RAND(),
; clamp each group of 8 source bytes with the three clamp vectors, add the
; noise bytes (wrapping add) and store the result back in place.
;
; The start and height argument slots are modified in place.
; No emms is issued here.
;-----------------------------------------------------------------------
global sym(vp9_plane_add_noise_mmx) PRIVATE
sym(vp9_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call        sym(LIBVPX_RAND) WRT_PLT
    and         rax, 0xff                       ; offset = rand() & 0xff

    mov         rdi, arg(1)                     ; noise
    add         rdi, rax                        ; rdi = noise + offset

    ; The clamp vectors are addressed off a single base, relying on them
    ; being stored contiguously in black / white / both order.  The base
    ; is loaded after the call because rand() may trash rdx.
    mov         rdx, arg(2)                     ; blackclamp

    movsxd      rcx, dword arg(5)               ; width
    mov         rsi, arg(0)                     ; current row
    xor         rax, rax                        ; byte offset within the row

.addnoise_nextset:
    movq        mm1, [rsi + rax]                ; 8 source bytes

    ; clamp both ends so that adding the noise cannot run out of range
    psubusb     mm1, [rdx]                      ; blackclamp
    paddusb     mm1, [rdx + 32]                 ; bothclamp
    psubusb     mm1, [rdx + 16]                 ; whiteclamp

    movq        mm2, [rdi + rax]                ; 8 noise bytes
    paddb       mm1, mm2                        ; add the noise
    movq        [rsi + rax], mm1                ; store the result

    add         rax, 8                          ; next 8 bytes of this row
    cmp         rax, rcx
    jl          .addnoise_nextset

    movsxd      rax, dword arg(7)               ; pitch
    add         arg(0), rax                     ; start += pitch
    sub         dword arg(6), 1                 ; height -= 1
    jg          .addnoise_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
align 16

; Filter taps for vp9_post_proc_down_and_across_mmx.  The code reads four
; words at byte offsets 0, 16, 32, 48 and 64:
;   +0, +16 -> 16     +32 -> 64     +48, +64 -> 16
Blur:
    times 16 dw 16
    times  8 dw 64
    times 16 dw 16
    times  8 dw  0

; Rounding value added before the shift by VP9_FILTER_SHIFT.
rd:
    times  4 dw 0x40
vp9/vp9_common.mk
View file @
89963bf5
...
...
@@ -80,7 +80,6 @@ VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_AVX2)
+=
common/x86/vp9_subpixel_8t_intrin_avx2.c
VP9_COMMON_SRCS-$(HAVE_SSSE3)
+=
common/x86/vp9_subpixel_8t_intrin_ssse3.c
ifeq
($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX)
+=
common/x86/vp9_postproc_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2)
+=
common/x86/vp9_postproc_sse2.asm
endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment