Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
A
aom-rav1e
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Xiph.Org
aom-rav1e
Commits
f1a3b1e0
Commit
f1a3b1e0
authored
Jun 24, 2010
by
Scott LaVarnway
Committed by
John Koleszar
Jun 24, 2010
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Added first-pass sse2 version of Yaowu's new fdct.
Change-Id: Ib479210067510162879c368428b92690591120b2
parent
d0dd01b8
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
180 additions
and
244 deletions
+180
-244
vp8/encoder/x86/dct_sse2.asm
vp8/encoder/x86/dct_sse2.asm
+159
-231
vp8/encoder/x86/dct_x86.h
vp8/encoder/x86/dct_x86.h
+10
-6
vp8/encoder/x86/x86_csystemdependent.c
vp8/encoder/x86/x86_csystemdependent.c
+10
-7
vp8/vp8cx.mk
vp8/vp8cx.mk
+1
-0
No files found.
vp8/encoder/x86/dct_sse2.asm
View file @
f1a3b1e0
...
...
@@ -11,251 +11,179 @@
%include "vpx_ports/x86_abi_support.asm"
global
sym
(
vp8_short_fdct4x4_wmt
)
%define DCTCONSTANTSBITS (16)
%define DCTROUNDINGVALUE (1<< (DCTCONSTANTSBITS-1))
%define x_c1 (60547)
; cos(pi /8) * (1<<15)
%define x_c2 (46341)
; cos(pi*2/8) * (1<<15)
%define x_c3 (25080)
; cos(pi*3/8) * (1<<15)
%define _1STSTAGESHIFT 14
%define _2NDSTAGESHIFT 16
;; using matrix multiply
;void vp8_short_fdct4x4_wmt(short *input, short *output)
sym
(
vp8_short_fdct4x4_wmt
):
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
global
sym
(
vp8_short_fdct4x4_sse2
)
sym
(
vp8_short_fdct4x4_sse2
):
push
rbp
mov
rbp
,
rsp
SHADOW_ARGS_TO_STACK
2
SHADOW_ARGS_TO_STACK
3
;; SAVE_XMM
GET_GOT
rbx
push
rsi
push
rdi
; end prolog
mov
rax
,
arg
(
0
)
;input
mov
rcx
,
arg
(
1
)
;output
lea
rdx
,
[
dct_matrix_sse2
GLOBAL
]
movdqu
xmm0
,
[
rax
]
movdqu
xmm1
,
[
rax
+
16
]
; first column
movdqa
xmm2
,
xmm0
movdqa
xmm7
,
[
rdx
]
pmaddwd
xmm2
,
xmm7
movdqa
xmm3
,
xmm1
pmaddwd
xmm3
,
xmm7
movdqa
xmm4
,
xmm2
punpckldq
xmm2
,
xmm3
punpckhdq
xmm4
,
xmm3
movdqa
xmm3
,
xmm2
punpckldq
xmm2
,
xmm4
punpckhdq
xmm3
,
xmm4
paddd
xmm2
,
xmm3
paddd
xmm2
,
XMMWORD
PTR
[
dct1st_stage_rounding_sse2
GLOBAL
]
psrad
xmm2
,
_1STSTAGESHIFT
;second column
movdqa
xmm3
,
xmm0
pmaddwd
xmm3
,
[
rdx
+
16
]
movdqa
xmm4
,
xmm1
pmaddwd
xmm4
,
[
rdx
+
16
]
movdqa
xmm5
,
xmm3
punpckldq
xmm3
,
xmm4
punpckhdq
xmm5
,
xmm4
movdqa
xmm4
,
xmm3
punpckldq
xmm3
,
xmm5
punpckhdq
xmm4
,
xmm5
paddd
xmm3
,
xmm4
paddd
xmm3
,
XMMWORD
PTR
[
dct1st_stage_rounding_sse2
GLOBAL
]
psrad
xmm3
,
_1STSTAGESHIFT
packssdw
xmm2
,
xmm3
;third column
movdqa
xmm3
,
xmm0
pmaddwd
xmm3
,
[
rdx
+
32
]
movdqa
xmm4
,
xmm1
pmaddwd
xmm4
,
[
rdx
+
32
]
movdqa
xmm5
,
xmm3
punpckldq
xmm3
,
xmm4
punpckhdq
xmm5
,
xmm4
movdqa
xmm4
,
xmm3
punpckldq
xmm3
,
xmm5
punpckhdq
xmm4
,
xmm5
paddd
xmm3
,
xmm4
paddd
xmm3
,
XMMWORD
PTR
[
dct1st_stage_rounding_sse2
GLOBAL
]
psrad
xmm3
,
_1STSTAGESHIFT
;fourth column (this is the last column, so we do not have save the source any more)
pmaddwd
xmm0
,
[
rdx
+
48
]
pmaddwd
xmm1
,
[
rdx
+
48
]
movdqa
xmm4
,
xmm0
punpckldq
xmm0
,
xmm1
punpckhdq
xmm4
,
xmm1
movdqa
xmm1
,
xmm0
punpckldq
xmm0
,
xmm4
punpckhdq
xmm1
,
xmm4
paddd
xmm0
,
xmm1
paddd
xmm0
,
XMMWORD
PTR
[
dct1st_stage_rounding_sse2
GLOBAL
]
psrad
xmm0
,
_1STSTAGESHIFT
packssdw
xmm3
,
xmm0
; done with one pass
; now start second pass
movdqa
xmm0
,
xmm2
movdqa
xmm1
,
xmm3
pmaddwd
xmm2
,
xmm7
pmaddwd
xmm3
,
xmm7
movdqa
xmm4
,
xmm2
punpckldq
xmm2
,
xmm3
mov
rsi
,
arg
(
0
)
movsxd
rax
,
DWORD
PTR
arg
(
2
)
lea
rdi
,
[
rsi
+
rax
*
2
]
movq
xmm0
,
MMWORD
PTR
[
rsi
]
;03 02 01 00
movq
xmm2
,
MMWORD
PTR
[
rsi
+
rax
]
;13 12 11 10
movq
xmm1
,
MMWORD
PTR
[
rsi
+
rax
*
2
]
;23 22 21 20
movq
xmm3
,
MMWORD
PTR
[
rdi
+
rax
]
;33 32 31 30
punpcklqdq
xmm0
,
xmm2
;13 12 11 10 03 02 01 00
punpcklqdq
xmm1
,
xmm3
;33 32 31 30 23 22 21 20
mov
rdi
,
arg
(
1
)
movdqa
xmm2
,
xmm0
punpckldq
xmm0
,
xmm1
;23 22 03 02 21 20 01 00
punpckhdq
xmm2
,
xmm1
;33 32 13 12 31 30 11 10
movdqa
xmm1
,
xmm0
punpckldq
xmm0
,
xmm2
;31 21 30 20 11 10 01 00
pshufhw
xmm1
,
xmm1
,
0b1h
;22 23 02 03 xx xx xx xx
pshufhw
xmm2
,
xmm2
,
0b1h
;32 33 12 13 xx xx xx xx
punpckhdq
xmm1
,
xmm2
;32 33 22 23 12 13 02 03
movdqa
xmm3
,
xmm0
paddw
xmm0
,
xmm1
;b1 a1 b1 a1 b1 a1 b1 a1
psubw
xmm3
,
xmm1
;c1 d1 c1 d1 c1 d1 c1 d1
psllw
xmm0
,
3
;b1 <<= 3 a1 <<= 3
psllw
xmm3
,
3
;c1 <<= 3 d1 <<= 3
movdqa
xmm1
,
xmm0
pmaddwd
xmm0
,
XMMWORD
PTR
[
_mult_add
GLOBAL
]
;a1 + b1
pmaddwd
xmm1
,
XMMWORD
PTR
[
_mult_sub
GLOBAL
]
;a1 - b1
movdqa
xmm4
,
xmm3
pmaddwd
xmm3
,
XMMWORD
PTR
[
_5352_2217
GLOBAL
]
;c1*2217 + d1*5352
pmaddwd
xmm4
,
XMMWORD
PTR
[
_2217_neg5352
GLOBAL
]
;d1*2217 - c1*5352
paddd
xmm3
,
XMMWORD
PTR
[
_14500
GLOBAL
]
paddd
xmm4
,
XMMWORD
PTR
[
_7500
GLOBAL
]
psrad
xmm3
,
12
;(c1 * 2217 + d1 * 5352 + 14500)>>12
psrad
xmm4
,
12
;(d1 * 2217 - c1 * 5352 + 7500)>>12
packssdw
xmm0
,
xmm1
;op[2] op[0]
packssdw
xmm3
,
xmm4
;op[3] op[1]
; 23 22 21 20 03 02 01 00
;
; 33 32 31 30 13 12 11 10
;
movdqa
xmm2
,
xmm0
punpcklqdq
xmm0
,
xmm3
;13 12 11 10 03 02 01 00
punpckhqdq
xmm2
,
xmm3
;23 22 21 20 33 32 31 30
movdqa
xmm3
,
xmm0
punpcklwd
xmm0
,
xmm2
;32 30 22 20 12 10 02 00
punpckhwd
xmm3
,
xmm2
;33 31 23 21 13 11 03 01
movdqa
xmm2
,
xmm0
punpcklwd
xmm0
,
xmm3
;13 12 11 10 03 02 01 00
punpckhwd
xmm2
,
xmm3
;33 32 31 30 23 22 21 20
movdqa
xmm5
,
XMMWORD
PTR
[
_7
GLOBAL
]
pshufd
xmm2
,
xmm2
,
04eh
movdqa
xmm3
,
xmm0
paddw
xmm0
,
xmm2
;b1 b1 b1 b1 a1 a1 a1 a1
psubw
xmm3
,
xmm2
;c1 c1 c1 c1 d1 d1 d1 d1
pshufd
xmm0
,
xmm0
,
0d8h
;b1 b1 a1 a1 b1 b1 a1 a1
movdqa
xmm2
,
xmm3
;save d1 for compare
pshufd
xmm3
,
xmm3
,
0d8h
;c1 c1 d1 d1 c1 c1 d1 d1
pshuflw
xmm0
,
xmm0
,
0d8h
;b1 b1 a1 a1 b1 a1 b1 a1
pshuflw
xmm3
,
xmm3
,
0d8h
;c1 c1 d1 d1 c1 d1 c1 d1
pshufhw
xmm0
,
xmm0
,
0d8h
;b1 a1 b1 a1 b1 a1 b1 a1
pshufhw
xmm3
,
xmm3
,
0d8h
;c1 d1 c1 d1 c1 d1 c1 d1
movdqa
xmm1
,
xmm0
pmaddwd
xmm0
,
XMMWORD
PTR
[
_mult_add
GLOBAL
]
;a1 + b1
pmaddwd
xmm1
,
XMMWORD
PTR
[
_mult_sub
GLOBAL
]
;a1 - b1
pxor
xmm4
,
xmm4
;zero out for compare
paddd
xmm0
,
xmm5
paddd
xmm1
,
xmm5
pcmpeqw
xmm2
,
xmm4
psrad
xmm0
,
4
;(a1 + b1 + 7)>>4
psrad
xmm1
,
4
;(a1 - b1 + 7)>>4
pandn
xmm2
,
XMMWORD
PTR
[
_cmp_mask
GLOBAL
]
;clear upper,
;and keep bit 0 of lower
movdqa
xmm4
,
xmm3
pmaddwd
xmm3
,
XMMWORD
PTR
[
_5352_2217
GLOBAL
]
;c1*2217 + d1*5352
pmaddwd
xmm4
,
XMMWORD
PTR
[
_2217_neg5352
GLOBAL
]
;d1*2217 - c1*5352
paddd
xmm3
,
XMMWORD
PTR
[
_12000
GLOBAL
]
paddd
xmm4
,
XMMWORD
PTR
[
_51000
GLOBAL
]
packssdw
xmm0
,
xmm1
;op[8] op[0]
psrad
xmm3
,
16
;(c1 * 2217 + d1 * 5352 + 12000)>>16
psrad
xmm4
,
16
;(d1 * 2217 - c1 * 5352 + 51000)>>16
packssdw
xmm3
,
xmm4
;op[12] op[4]
movdqa
xmm1
,
xmm0
paddw
xmm3
,
xmm2
;op[4] += (d1!=0)
punpcklqdq
xmm0
,
xmm3
;op[4] op[0]
punpckhqdq
xmm1
,
xmm3
;op[12] op[8]
movdqa
XMMWORD
PTR
[
rdi
+
0
],
xmm0
movdqa
XMMWORD
PTR
[
rdi
+
16
],
xmm1
punpckhdq
xmm4
,
xmm3
movdqa
xmm3
,
xmm2
punpckldq
xmm2
,
xmm4
punpckhdq
xmm3
,
xmm4
paddd
xmm2
,
xmm3
paddd
xmm2
,
XMMWORD
PTR
[
dct2nd_stage_rounding_sse2
GLOBAL
]
psrad
xmm2
,
_2NDSTAGESHIFT
;second column
movdqa
xmm3
,
xmm0
pmaddwd
xmm3
,
[
rdx
+
16
]
movdqa
xmm4
,
xmm1
pmaddwd
xmm4
,
[
rdx
+
16
]
movdqa
xmm5
,
xmm3
punpckldq
xmm3
,
xmm4
punpckhdq
xmm5
,
xmm4
movdqa
xmm4
,
xmm3
punpckldq
xmm3
,
xmm5
punpckhdq
xmm4
,
xmm5
paddd
xmm3
,
xmm4
paddd
xmm3
,
XMMWORD
PTR
[
dct2nd_stage_rounding_sse2
GLOBAL
]
psrad
xmm3
,
_2NDSTAGESHIFT
packssdw
xmm2
,
xmm3
movdqu
[
rcx
],
xmm2
;third column
movdqa
xmm3
,
xmm0
pmaddwd
xmm3
,
[
rdx
+
32
]
movdqa
xmm4
,
xmm1
pmaddwd
xmm4
,
[
rdx
+
32
]
movdqa
xmm5
,
xmm3
punpckldq
xmm3
,
xmm4
punpckhdq
xmm5
,
xmm4
movdqa
xmm4
,
xmm3
punpckldq
xmm3
,
xmm5
punpckhdq
xmm4
,
xmm5
paddd
xmm3
,
xmm4
paddd
xmm3
,
XMMWORD
PTR
[
dct2nd_stage_rounding_sse2
GLOBAL
]
psrad
xmm3
,
_2NDSTAGESHIFT
;fourth column
pmaddwd
xmm0
,
[
rdx
+
48
]
pmaddwd
xmm1
,
[
rdx
+
48
]
movdqa
xmm4
,
xmm0
punpckldq
xmm0
,
xmm1
punpckhdq
xmm4
,
xmm1
movdqa
xmm1
,
xmm0
punpckldq
xmm0
,
xmm4
punpckhdq
xmm1
,
xmm4
paddd
xmm0
,
xmm1
paddd
xmm0
,
XMMWORD
PTR
[
dct2nd_stage_rounding_sse2
GLOBAL
]
psrad
xmm0
,
_2NDSTAGESHIFT
packssdw
xmm3
,
xmm0
movdqu
[
rcx
+
16
],
xmm3
mov
rsp
,
rbp
; begin epilog
pop
rdi
pop
rsi
REST
ORE_GOT
;; RESTORE_XMM
UNSHADOW_ARGS
pop
rbp
ret
SECTION
_RODATA
;static unsigned int dct1st_stage_rounding_sse2[4] =
align
16
dct1st_stage_rounding_sse2:
times
4
dd
8192
;static unsigned int dct2nd_stage_rounding_sse2[4] =
_5352_2217:
dw
5352
dw
2217
dw
5352
dw
2217
dw
5352
dw
2217
dw
5352
dw
2217
align
16
dct2nd_stage_rounding_sse2:
times
4
dd
32768
;static short dct_matrix_sse2[4][8]=
_2217_neg5352:
dw
2217
dw
-
5352
dw
2217
dw
-
5352
dw
2217
dw
-
5352
dw
2217
dw
-
5352
align
16
dct_matrix_sse2:
times
8
dw
23170
dw
30274
dw
12540
dw
-
12540
dw
-
30274
dw
30274
dw
12540
dw
-
12540
dw
-
30274
dw
23170
times
2
dw
-
23170
times
2
dw
23170
times
2
dw
-
23170
dw
23170
_mult_add:
times
8
dw
1
align
16
_cmp_mask:
times
4
dw
1
times
4
dw
0
dw
12540
dw
-
30274
dw
30274
dw
-
12540
dw
12540
dw
-
30274
dw
30274
dw
-
12540
align
16
_mult_sub:
dw
1
dw
-
1
dw
1
dw
-
1
dw
1
dw
-
1
dw
1
dw
-
1
align
16
_7:
times
4
dd
7
align
16
_14500:
times
4
dd
14500
align
16
_7500:
times
4
dd
7500
align
16
_12000:
times
4
dd
12000
align
16
_51000:
times
4
dd
51000
vp8/encoder/x86/dct_x86.h
View file @
f1a3b1e0
...
...
@@ -24,7 +24,7 @@ extern prototype_fdct(vp8_short_fdct4x4_mmx);
extern
prototype_fdct
(
vp8_short_fdct8x4_mmx
);
#if !CONFIG_RUNTIME_CPU_DETECT
#if 0
new c version,
#if 0
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
...
...
@@ -40,19 +40,23 @@ extern prototype_fdct(vp8_short_fdct8x4_mmx);
extern
prototype_fdct
(
vp8_short_fdct8x4_wmt
);
extern
prototype_fdct
(
vp8_short_walsh4x4_sse2
);
#if !CONFIG_RUNTIME_CPU_DETECT
extern
prototype_fdct
(
vp8_short_fdct4x4_sse2
);
#if 0
#if !CONFIG_RUNTIME_CPU_DETECT
#if 1
/* short SSE2 DCT currently disabled, does not match the MMX version */
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_
wmt
#define vp8_fdct_short4x4 vp8_short_fdct4x4_
sse2
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp8_short_fdct8x4_
wmt
#define vp8_fdct_short8x4 vp8_short_fdct8x4_
sse2
#endif
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp8_
fast_fdct8x4_wmt
#define vp8_fdct_fast8x4 vp8_
short_fdct8x4_sse2
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_sse2
...
...
vp8/encoder/x86/x86_csystemdependent.c
View file @
f1a3b1e0
...
...
@@ -82,6 +82,11 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
#endif
#if HAVE_SSE2
void
vp8_short_fdct8x4_sse2
(
short
*
input
,
short
*
output
,
int
pitch
)
{
vp8_short_fdct4x4_sse2
(
input
,
output
,
pitch
);
vp8_short_fdct4x4_sse2
(
input
+
4
,
output
+
16
,
pitch
);
}
int
vp8_fast_quantize_b_impl_sse
(
short
*
coeff_ptr
,
short
*
zbin_ptr
,
short
*
qcoeff_ptr
,
short
*
dequant_ptr
,
...
...
@@ -268,13 +273,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi
->
rtcd
.
variance
.
get16x16var
=
vp8_get16x16var_sse2
;
/* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */
;
#if 0 //new fdct
/* short SSE2 DCT currently disabled, does not match the MMX version */
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt;
/* cpi->rtcd.fdct.fast4x4 not implemented for wmt */;
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_wmt;
#endif
cpi
->
rtcd
.
fdct
.
short4x4
=
vp8_short_fdct4x4_sse2
;
cpi
->
rtcd
.
fdct
.
short8x4
=
vp8_short_fdct8x4_sse2
;
cpi
->
rtcd
.
fdct
.
fast4x4
=
vp8_short_fdct4x4_sse2
;
cpi
->
rtcd
.
fdct
.
fast8x4
=
vp8_short_fdct8x4_sse2
;
cpi
->
rtcd
.
fdct
.
walsh_short4x4
=
vp8_short_walsh4x4_sse2
;
cpi
->
rtcd
.
encodemb
.
berr
=
vp8_block_error_xmm
;
...
...
vp8/vp8cx.mk
View file @
f1a3b1e0
...
...
@@ -93,6 +93,7 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX)
+=
encoder/x86/sad_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX)
+=
encoder/x86/dct_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX)
+=
encoder/x86/subtract_mmx.asm
VP8_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/variance_sse2.c
VP8_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/variance_impl_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2)
+=
encoder/x86/sad_sse2.asm
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment