Commit 31bdd704
Authored Jul 25, 2004 by Josh Coalson
add brady's first round of altivec implementations
parent 41294c92
Changes: 3 files
src/libFLAC/Makefile.am
@@ -42,6 +42,10 @@ ARCH_SUBDIRS = ia32
libFLAC_la_LIBADD = ia32/libFLAC-asm.la
endif
endif
if FLaC__CPU_PPC
ARCH_SUBDIRS = ppc
libFLAC_la_LIBADD = ppc/libFLAC-asm.la
endif
endif
SUBDIRS = $(ARCH_SUBDIRS) include .
src/libFLAC/ppc/Makefile.am
new file mode 100644
# libFLAC - Free Lossless Audio Codec library
# Copyright (C) 2004 Josh Coalson
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# - Neither the name of the Xiph.org Foundation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SUFFIXES = .s .lo

.s.lo:
	$(LIBTOOL) --mode=compile as -force_cpusubtype_ALL -o $@ $<

noinst_LTLIBRARIES = libFLAC-asm.la

libFLAC_asm_la_SOURCES = \
	lpc_asm.s
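For illustration (not part of the commit): when make fires the .s.lo suffix rule for lpc_asm.s, the recipe expands to roughly the following command, with $(LIBTOOL) resolved by configure:

	$(LIBTOOL) --mode=compile as -force_cpusubtype_ALL -o lpc_asm.lo lpc_asm.s

-force_cpusubtype_ALL is an Apple assembler flag that keeps the object's CPU subtype marked ALL even though the file contains AltiVec instructions, so the resulting library stays loadable on any PowerPC subtype.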
src/libFLAC/ppc/lpc_asm.s
new file mode 100644
; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2004 Josh Coalson
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; - Neither the name of the Xiph.org Foundation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	.text
	.align 2

	.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
	.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8

_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
; r3: residual[]
; r4: data_len
; r5: qlp_coeff[]
; r6: order
; r7: lp_quantization
; r8: data[]

; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
; this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
; bps<=15 for mid-side coding, since that uses an extra bit)
; it should be fast; the inner loop is unrolled (it takes no more than
; 3*(order/4) instructions, all of which are arithmetic), and all of the
; coefficients and all relevant history stay in registers, so the outer loop
; has only one load from memory (the residual)
; I haven't yet run this through simg4, so there may be some avoidable stalls,
; and there may be a somewhat more clever way to do the outer loop
; the branch mechanism may prevent dynamic loading; I still need to examine
; this issue, and there may be a more elegant method
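As an editorial aside (not part of the original commit), the scalar loop in lpc.c that this routine vectorizes computes, in outline, the following. The function name below is illustrative, data[] is assumed to be preceded by `order` warm-up samples as in FLAC, and with bps<=16 every product fits comfortably in 32 bits:

#include <stdint.h>

/* Illustrative scalar sketch of FLAC__lpc_restore_signal(): each output
 * sample is the residual plus the quantized linear prediction from the
 * previous `order` samples. Negative indices into data[] reach the
 * warm-up history preceding the block. */
static void lpc_restore_signal_sketch(const int32_t residual[], unsigned data_len,
                                      const int32_t qlp_coeff[], unsigned order,
                                      int lp_quantization, int32_t data[])
{
	for (int i = 0; i < (int)data_len; i++) {
		int32_t sum = 0;
		for (int j = 0; j < (int)order; j++)
			sum += qlp_coeff[j] * data[i - j - 1];
		data[i] = residual[i] + (sum >> lp_quantization); /* arithmetic shift, as vsraw does */
	}
}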
	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 ; for quadword-aligned stack data

	slwi r6,r6,2 ; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 ; r4 = data+data_len

	mfspr r0,256 ; cache old vrsave
	addis r31,0,hi16(0xfffffc00)
	ori r31,r31,lo16(0xfffffc00)
	mtspr 256,r31 ; declare VRs in vrsave

	cmplw cr0,r8,r4 ; i<data_len
	bc 4,0,L1400

	; load coefficients into v0-v7 and initial history into v8-v15
	li r31,0xf
	and r31,r8,r31 ; r31: data%4
	li r11,16
	subf r31,r31,r11 ; r31: 4-(data%4)
	slwi r31,r31,3 ; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v18,-1
	vsro v18,v18,v0 ; v18: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2 ; v0: reversal permutation vector

	add r10,r5,r6
	lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
	vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v16,0,r11 ; v16: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v17
	lvx v8,0,r11
	addi r11,r11,-16
	lvx v9,0,r11
	vperm v8,v9,v8,v16
	cmplw cr0,r5,r10
	bc 12,0,L1101
	vand v0,v0,v18
	addis r31,0,hi16(L1307)
	ori r31,r31,lo16(L1307)
	b L1199

L1101:
	addi r5,r5,16
	lvx v2,0,r5
	vperm v1,v1,v2,v17
	addi r11,r11,-16
	lvx v10,0,r11
	vperm v9,v10,v9,v16
	cmplw cr0,r5,r10
	bc 12,0,L1102
	vand v1,v1,v18
	addis r31,0,hi16(L1306)
	ori r31,r31,lo16(L1306)
	b L1199

L1102:
	addi r5,r5,16
	lvx v3,0,r5
	vperm v2,v2,v3,v17
	addi r11,r11,-16
	lvx v11,0,r11
	vperm v10,v11,v10,v16
	cmplw cr0,r5,r10
	bc 12,0,L1103
	vand v2,v2,v18
	addis r31,0,hi16(L1305)
	ori r31,r31,lo16(L1305)
	b L1199

L1103:
	addi r5,r5,16
	lvx v4,0,r5
	vperm v3,v3,v4,v17
	addi r11,r11,-16
	lvx v12,0,r11
	vperm v11,v12,v11,v16
	cmplw cr0,r5,r10
	bc 12,0,L1104
	vand v3,v3,v18
	addis r31,0,hi16(L1304)
	ori r31,r31,lo16(L1304)
	b L1199

L1104:
	addi r5,r5,16
	lvx v5,0,r5
	vperm v4,v4,v5,v17
	addi r11,r11,-16
	lvx v13,0,r11
	vperm v12,v13,v12,v16
	cmplw cr0,r5,r10
	bc 12,0,L1105
	vand v4,v4,v18
	addis r31,0,hi16(L1303)
	ori r31,r31,lo16(L1303)
	b L1199

L1105:
	addi r5,r5,16
	lvx v6,0,r5
	vperm v5,v5,v6,v17
	addi r11,r11,-16
	lvx v14,0,r11
	vperm v13,v14,v13,v16
	cmplw cr0,r5,r10
	bc 12,0,L1106
	vand v5,v5,v18
	addis r31,0,hi16(L1302)
	ori r31,r31,lo16(L1302)
	b L1199

L1106:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v6,v6,v7,v17
	addi r11,r11,-16
	lvx v15,0,r11
	vperm v14,v15,v14,v16
	cmplw cr0,r5,r10
	bc 12,0,L1107
	vand v6,v6,v18
	addis r31,0,hi16(L1301)
	ori r31,r31,lo16(L1301)
	b L1199

L1107:
	addi r5,r5,16
	lvx v19,0,r5
	vperm v7,v7,v19,v17
	addi r11,r11,-16
	lvx v19,0,r11
	vperm v15,v19,v15,v16
	vand v7,v7,v18
	addis r31,0,hi16(L1300)
	ori r31,r31,lo16(L1300)

L1199:
	mtctr r31

	; set up invariant vectors
	vspltish v16,0 ; v16: zero vector
	li r10,-12
	lvsr v17,r10,r8 ; v17: result shift vector
	lvsl v18,r10,r3 ; v18: residual shift back vector
	li r10,-4
	stw r7,-4(r9)
	lvewx v19,r10,r9 ; v19: lp_quantization vector

L1200:
	vmulosh v20,v0,v8 ; v20: sum vector
	bcctr 20,0

L1300:
	vmulosh v21,v7,v15
	vsldoi v15,v15,v14,4 ; increment history
	vaddsws v20,v20,v21

L1301:
	vmulosh v21,v6,v14
	vsldoi v14,v14,v13,4
	vaddsws v20,v20,v21

L1302:
	vmulosh v21,v5,v13
	vsldoi v13,v13,v12,4
	vaddsws v20,v20,v21

L1303:
	vmulosh v21,v4,v12
	vsldoi v12,v12,v11,4
	vaddsws v20,v20,v21

L1304:
	vmulosh v21,v3,v11
	vsldoi v11,v11,v10,4
	vaddsws v20,v20,v21

L1305:
	vmulosh v21,v2,v10
	vsldoi v10,v10,v9,4
	vaddsws v20,v20,v21

L1306:
	vmulosh v21,v1,v9
	vsldoi v9,v9,v8,4
	vaddsws v20,v20,v21

L1307:
	vsumsws v20,v20,v16 ; v20[3]: sum
	vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization
	lvewx v21,0,r3 ; v21[n]: *residual
	vperm v21,v21,v21,v18 ; v21[3]: *residual
	vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
	vsldoi v18,v18,v18,4 ; increment shift vector
	vperm v21,v20,v20,v17 ; v21[n]: shift for storage
	vsldoi v17,v17,v17,12 ; increment shift vector
	stvewx v21,0,r8
	vsldoi v20,v20,v20,12
	vsldoi v8,v8,v20,4 ; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 ; i<data_len
	bc 12,0,L1200

L1400:
	mtspr 256,r0 ; restore old vrsave
	lmw r31,-4(r1)
	blr
_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
; r3: residual[]
; r4: data_len
; r5: qlp_coeff[]
; r6: order
; r7: lp_quantization
; r8: data[]

; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
; this version assumes order<=8; it uses fewer vector registers, which should
; save time in context switches, and has less code, which may improve
; instruction caching
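A brief aside on the register budget (an illustration, not from the commit): an AltiVec register holds four 32-bit words, so with order<=8 the eight coefficients fit in v0-v1 and the history in v2-v3, and the vrsave mask set below shrinks from 0xfffffc00 (v0-v21, as in the general routine) to 0xffc00000 (v0-v9). Both routines also rely on vmulosh, which multiplies only the low signed halfword of each 32-bit lane; that is where the bps<=16 requirement comes from. A one-lane C model, with a hypothetical helper name:

#include <stdint.h>

/* One-lane model of vmulosh: truncation to int16_t models the lane
 * extraction, so the product is exact only when both the sample and the
 * quantized coefficient fit in a signed halfword (bps<=16). */
static int32_t vmulosh_lane(int32_t coeff, int32_t sample)
{
	return (int32_t)((int16_t)coeff * (int16_t)sample);
}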
	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 ; for quadword-aligned stack data

	slwi r6,r6,2 ; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 ; r4 = data+data_len

	mfspr r0,256 ; cache old vrsave
	addis r31,0,hi16(0xffc00000)
	ori r31,r31,lo16(0xffc00000)
	mtspr 256,r31 ; declare VRs in vrsave

	cmplw cr0,r8,r4 ; i<data_len
	bc 4,0,L2400

	; load coefficients into v0-v1 and initial history into v2-v3
	li r31,0xf
	and r31,r8,r31 ; r31: data%4
	li r11,16
	subf r31,r31,r11 ; r31: 4-(data%4)
	slwi r31,r31,3 ; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v6,-1
	vsro v6,v6,v0 ; v6: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2 ; v0: reversal permutation vector

	add r10,r5,r6
	lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
	vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v4,0,r11 ; v4: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v5
	lvx v2,0,r11
	addi r11,r11,-16
	lvx v3,0,r11
	vperm v2,v3,v2,v4
	cmplw cr0,r5,r10
	bc 12,0,L2101
	vand v0,v0,v6
	addis r31,0,hi16(L2301)
	ori r31,r31,lo16(L2301)
	b L2199

L2101:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v1,v1,v7,v5
	addi r11,r11,-16
	lvx v7,0,r11
	vperm v3,v7,v3,v4
	vand v1,v1,v6
	addis r31,0,hi16(L2300)
	ori r31,r31,lo16(L2300)

L2199:
	mtctr r31

	; set up invariant vectors
	vspltish v4,0 ; v4: zero vector
	li r10,-12
	lvsr v5,r10,r8 ; v5: result shift vector
	lvsl v6,r10,r3 ; v6: residual shift back vector
	li r10,-4
	stw r7,-4(r9)
	lvewx v7,r10,r9 ; v7: lp_quantization vector

L2200:
	vmulosh v8,v0,v2 ; v8: sum vector
	bcctr 20,0

L2300:
	vmulosh v9,v1,v3
	vsldoi v3,v3,v2,4
	vaddsws v8,v8,v9

L2301:
	vsumsws v8,v8,v4 ; v8[3]: sum
	vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization
	lvewx v9,0,r3 ; v9[n]: *residual
	vperm v9,v9,v9,v6 ; v9[3]: *residual
	vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
	vsldoi v6,v6,v6,4 ; increment shift vector
	vperm v9,v8,v8,v5 ; v9[n]: shift for storage
	vsldoi v5,v5,v5,12 ; increment shift vector
	stvewx v9,0,r8
	vsldoi v8,v8,v8,12
	vsldoi v2,v2,v8,4 ; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 ; i<data_len
	bc 12,0,L2200

L2400:
	mtspr 256,r0 ; restore old vrsave
	lmw r31,-4(r1)
	blr