Commit 47faab80 authored by Josh Coalson's avatar Josh Coalson
Browse files

split PPC asm code into two directories, one holding asm compatible with OS X...

split PPC asm code into two directories, one holding asm compatible with OS X native 'as', and one for 'gas'
parent 5f2b46d0
......@@ -44,7 +44,8 @@ endif
endif
if FLaC__CPU_PPC
ARCH_SUBDIRS = ppc
libFLAC_la_LIBADD = ppc/libFLAC-asm.la
#@@@@@@ need to conditionalize this to gas or as depending on configure
libFLAC_la_LIBADD = ppc/as/libFLAC-asm.la
LOCAL_EXTRA_LDFLAGS = "-Wl,-read_only_relocs,warning"
endif
endif
......
......@@ -49,7 +49,7 @@ DEBUG_CFLAGS = -DFLAC__OVERFLOW_DETECT
ifeq ($(DARWIN_BUILD),yes)
SRCS_S = \
ppc/lpc_asm.s
ppc/as/lpc_asm.s
else
ifeq ($(SOLARIS_BUILD),yes)
else
......
......@@ -28,15 +28,6 @@
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SUFFIXES = .s .lo
STRIP_NON_ASM = sh $(top_srcdir)/strip_non_asm_libtool_args.sh
# For some unknown reason libtool can't figure out the tag for 'as', so
# we fake it with --tag=CC and strip out unwanted options.
.s.lo:
$(LIBTOOL) --tag=CC --mode=compile $(STRIP_NON_ASM) as -force_cpusubtype_ALL -o $@ $<
noinst_LTLIBRARIES = libFLAC-asm.la
libFLAC_asm_la_SOURCES = \
lpc_asm.s
#@@@@@@ We actually want to only compile one based on some configuration,
#@@@@@@ so we'll eventually have to conditionalize the sub-Makefile.am's
SUBDIRS = as gas
# libFLAC - Free Lossless Audio Codec library
# Copyright (C) 2004 Josh Coalson
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# - Neither the name of the Xiph.org Foundation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Build rules for the PowerPC assembly that is written for the OS X native
# assembler ('as').  The result is a libtool convenience library that is
# not installed.

# Teach automake/make about the .s -> .lo suffix rule defined below.
SUFFIXES = .s .lo

# Helper script run between libtool and the assembler; it receives the
# assembler command line as its arguments.
STRIP_NON_ASM = sh $(top_srcdir)/strip_non_asm_libtool_args.sh

# For some unknown reason libtool can't figure out the tag for 'as', so
# we fake it with --tag=CC and strip out unwanted options.
# NOTE: the recipe line below must start with a hard TAB, otherwise make
# stops with "missing separator".
.s.lo:
	$(LIBTOOL) --tag=CC --mode=compile $(STRIP_NON_ASM) as -force_cpusubtype_ALL -o $@ $<

# Convenience library (noinst_: built but never installed).
noinst_LTLIBRARIES = libFLAC-asm.la

libFLAC_asm_la_SOURCES = \
	lpc_asm.s
# libFLAC - Free Lossless Audio Codec library
# Copyright (C) 2004 Josh Coalson
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# - Neither the name of the Xiph.org Foundation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Build rules for the PowerPC assembly that is written for the GNU
# assembler ('gas').  The result is a libtool convenience library that is
# not installed.

# Teach automake/make about the .s -> .lo suffix rule defined below.
SUFFIXES = .s .lo

# Helper script run between libtool and the assembler; it receives the
# assembler command line as its arguments.
STRIP_NON_ASM = sh $(top_srcdir)/strip_non_asm_libtool_args.sh

# For some unknown reason libtool can't figure out the tag for 'gas', so
# we fake it with --tag=CC and strip out unwanted options.
# NOTE: the recipe line below must start with a hard TAB, otherwise make
# stops with "missing separator".
# NOTE(review): -force_cpusubtype_ALL looks like an option of Apple's 'as';
# confirm that the 'gas' binary used here accepts it.
.s.lo:
	$(LIBTOOL) --tag=CC --mode=compile $(STRIP_NON_ASM) gas -force_cpusubtype_ALL -o $@ $<

# Convenience library (noinst_: built but never installed).
noinst_LTLIBRARIES = libFLAC-asm.la

libFLAC_asm_la_SOURCES = \
	lpc_asm.s
# libFLAC - Free Lossless Audio Codec library
# Copyright (C) 2004 Josh Coalson
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# - Neither the name of the Xiph.org Foundation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Everything below is assembled into the code section.
.text
# On PowerPC, gas treats the .align operand as a power of two, so this asks
# for 4-byte (one instruction) alignment.
.align 2
# Export both entry points defined in this file and mark them as functions.
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16, @function
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8, @function
_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
# r3: residual[]
# r4: data_len
# r5: qlp_coeff[]
# r6: order
# r7: lp_quantization
# r8: data[]
# see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
# this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
# bps<=15 for mid-side coding, since that uses an extra bit)
# it should be fast; the inner loop is unrolled (each additional group of four
# coefficients costs three register-only vector instructions), and all of the
# coefficients and all relevant history stay in registers, so the outer loop
# has only one load from memory (the residual)
# I have not yet run this through simg4, so there may be some avoidable stalls,
# and there may be a somewhat more clever way to do the outer loop
# the branch mechanism may prevent dynamic loading; I still need to examine
# this issue, and there may be a more elegant method
#
# layout of the code:
#   - prologue: save r31, pick an aligned scratch slot, set vrsave
#   - L1101..L1107: load up to eight coefficient vectors (v0-v7) and the
#     matching history vectors (v8-v15); as soon as the coefficients run out,
#     mask the last vector and remember in r31 where the unrolled chain
#     L1300..L1307 has to be entered
#   - L1199: put that entry address in CTR and build the loop-invariant vectors
#   - L1200: per-sample loop; jumps through CTR into the unrolled chain
#   - L1400: epilogue
stmw r31,-4(r1)
# r9 = 16-byte aligned scratch address below the stack pointer
addi r9,r1,-28
li r31,0xf
andc r9,r9,r31 # for quadword-aligned stack data
slwi r6,r6,2 # adjust for word size
slwi r4,r4,2
add r4,r4,r8 # r4 = data+data_len
mfspr r0,256 # cache old vrsave
# 0xfffffc00: the top 22 bits, i.e. v0-v21
addis r31,0,0xffff
ori r31,r31,0xfc00
mtspr 256,r31 # declare VRs in vrsave
cmplw cr0,r8,r4 # i<data_len
bc 4,0,L1400 # nothing to do unless data < data+data_len
# load coefficients into v0-v7 and initial history into v8-v15
li r31,0xf
and r31,r8,r31 # r31: byte offset of data within its quadword (data&15)
li r11,16
subf r31,r31,r11 # r31: 16-(data&15)
slwi r31,r31,3 # convert to bits for vsro
li r10,-4
stw r31,-4(r9)
lvewx v0,r10,r9
vspltisb v18,-1
vsro v18,v18,v0 # v18: mask vector
li r31,0x8
lvsl v0,0,r31
vsldoi v0,v0,v0,12
li r31,0xc
lvsl v1,0,r31
vspltisb v2,0
vspltisb v3,-1
vmrglw v2,v2,v3
vsel v0,v1,v0,v2 # v0: reversal permutation vector
add r10,r5,r6 # r10 = qlp_coeff+order (end of coefficients)
lvsl v17,0,r5 # v17: coefficient alignment permutation vector
vperm v17,v17,v17,v0 # v17: reversal coefficient alignment permutation vector
mr r11,r8
lvsl v16,0,r11 # v16: history alignment permutation vector
lvx v0,0,r5
addi r5,r5,16
lvx v1,0,r5
vperm v0,v0,v1,v17
lvx v8,0,r11
addi r11,r11,-16
lvx v9,0,r11
vperm v8,v9,v8,v16
cmplw cr0,r5,r10
bc 12,0,L1101 # more coefficients left: load the next vector
vand v0,v0,v18
# entry address is built with lis/la: @ha is the high half adjusted for the
# sign-extended @l that la (addi) adds, so the pair always yields the exact
# address; combining @ha with ori would be 0x10000 too high whenever bit 15
# of the address is set
lis r31,L1307@ha
la r31,L1307@l(r31)
b L1199
L1101:
addi r5,r5,16
lvx v2,0,r5
vperm v1,v1,v2,v17
addi r11,r11,-16
lvx v10,0,r11
vperm v9,v10,v9,v16
cmplw cr0,r5,r10
bc 12,0,L1102
vand v1,v1,v18
lis r31,L1306@ha
la r31,L1306@l(r31)
b L1199
L1102:
addi r5,r5,16
lvx v3,0,r5
vperm v2,v2,v3,v17
addi r11,r11,-16
lvx v11,0,r11
vperm v10,v11,v10,v16
cmplw cr0,r5,r10
bc 12,0,L1103
vand v2,v2,v18
lis r31,L1305@ha
la r31,L1305@l(r31)
b L1199
L1103:
addi r5,r5,16
lvx v4,0,r5
vperm v3,v3,v4,v17
addi r11,r11,-16
lvx v12,0,r11
vperm v11,v12,v11,v16
cmplw cr0,r5,r10
bc 12,0,L1104
vand v3,v3,v18
lis r31,L1304@ha
la r31,L1304@l(r31)
b L1199
L1104:
addi r5,r5,16
lvx v5,0,r5
vperm v4,v4,v5,v17
addi r11,r11,-16
lvx v13,0,r11
vperm v12,v13,v12,v16
cmplw cr0,r5,r10
bc 12,0,L1105
vand v4,v4,v18
lis r31,L1303@ha
la r31,L1303@l(r31)
b L1199
L1105:
addi r5,r5,16
lvx v6,0,r5
vperm v5,v5,v6,v17
addi r11,r11,-16
lvx v14,0,r11
vperm v13,v14,v13,v16
cmplw cr0,r5,r10
bc 12,0,L1106
vand v5,v5,v18
lis r31,L1302@ha
la r31,L1302@l(r31)
b L1199
L1106:
addi r5,r5,16
lvx v7,0,r5
vperm v6,v6,v7,v17
addi r11,r11,-16
lvx v15,0,r11
vperm v14,v15,v14,v16
cmplw cr0,r5,r10
bc 12,0,L1107
vand v6,v6,v18
lis r31,L1301@ha
la r31,L1301@l(r31)
b L1199
L1107:
# last possible vector: no further length check, all eight stages are used
addi r5,r5,16
lvx v19,0,r5
vperm v7,v7,v19,v17
addi r11,r11,-16
lvx v19,0,r11
vperm v15,v19,v15,v16
vand v7,v7,v18
lis r31,L1300@ha
la r31,L1300@l(r31)
L1199:
mtctr r31 # CTR: entry point into the unrolled chain below
# set up invariant vectors
vspltish v16,0 # v16: zero vector
li r10,-12
lvsr v17,r10,r8 # v17: result shift vector
lvsl v18,r10,r3 # v18: residual shift back vector
li r10,-4
stw r7,-4(r9)
lvewx v19,r10,r9 # v19: lp_quantization vector
L1200:
vmulosh v20,v0,v8 # v20: sum vector
bcctr 20,0 # unconditional jump to the stage selected above
L1300:
vmulosh v21,v7,v15
vsldoi v15,v15,v14,4 # increment history
vaddsws v20,v20,v21
L1301:
vmulosh v21,v6,v14
vsldoi v14,v14,v13,4
vaddsws v20,v20,v21
L1302:
vmulosh v21,v5,v13
vsldoi v13,v13,v12,4
vaddsws v20,v20,v21
L1303:
vmulosh v21,v4,v12
vsldoi v12,v12,v11,4
vaddsws v20,v20,v21
L1304:
vmulosh v21,v3,v11
vsldoi v11,v11,v10,4
vaddsws v20,v20,v21
L1305:
vmulosh v21,v2,v10
vsldoi v10,v10,v9,4
vaddsws v20,v20,v21
L1306:
vmulosh v21,v1,v9
vsldoi v9,v9,v8,4
vaddsws v20,v20,v21
L1307:
vsumsws v20,v20,v16 # v20[3]: sum
vsraw v20,v20,v19 # v20[3]: sum >> lp_quantization
lvewx v21,0,r3 # v21[n]: *residual
vperm v21,v21,v21,v18 # v21[3]: *residual
vaddsws v20,v21,v20 # v20[3]: *residual + (sum >> lp_quantization)
vsldoi v18,v18,v18,4 # increment shift vector
vperm v21,v20,v20,v17 # v21[n]: shift for storage
vsldoi v17,v17,v17,12 # increment shift vector
stvewx v21,0,r8
vsldoi v20,v20,v20,12
vsldoi v8,v8,v20,4 # insert value onto history
addi r3,r3,4
addi r8,r8,4
cmplw cr0,r8,r4 # i<data_len
bc 12,0,L1200
L1400:
mtspr 256,r0 # restore old vrsave
lmw r31,-4(r1)
blr
_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
# r3: residual[]
# r4: data_len
# r5: qlp_coeff[]
# r6: order
# r7: lp_quantization
# r8: data[]
# see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
# this version assumes order<=8; it uses fewer vector registers, which should
# save time in context switches, and has less code, which may improve
# instruction caching
#
# layout of the code:
#   - prologue: save r31, pick an aligned scratch slot, set vrsave
#   - load one or two coefficient vectors (v0-v1) and the matching history
#     vectors (v2-v3); r31 remembers whether the per-sample loop must enter
#     at L2300 (two vectors) or L2301 (one vector)
#   - L2199: put that entry address in CTR and build the loop-invariant vectors
#   - L2200: per-sample loop; jumps through CTR to L2300 or L2301
#   - L2400: epilogue
stmw r31,-4(r1)
# r9 = 16-byte aligned scratch address below the stack pointer
addi r9,r1,-28
li r31,0xf
andc r9,r9,r31 # for quadword-aligned stack data
slwi r6,r6,2 # adjust for word size
slwi r4,r4,2
add r4,r4,r8 # r4 = data+data_len
mfspr r0,256 # cache old vrsave
# 0xffc00000: the top 10 bits, i.e. v0-v9
addis r31,0,0xffc0
ori r31,r31,0x0000
mtspr 256,r31 # declare VRs in vrsave
cmplw cr0,r8,r4 # i<data_len
bc 4,0,L2400 # nothing to do unless data < data+data_len
# load coefficients into v0-v1 and initial history into v2-v3
li r31,0xf
and r31,r8,r31 # r31: byte offset of data within its quadword (data&15)
li r11,16
subf r31,r31,r11 # r31: 16-(data&15)
slwi r31,r31,3 # convert to bits for vsro
li r10,-4
stw r31,-4(r9)
lvewx v0,r10,r9
vspltisb v6,-1
vsro v6,v6,v0 # v6: mask vector
li r31,0x8
lvsl v0,0,r31
vsldoi v0,v0,v0,12
li r31,0xc
lvsl v1,0,r31
vspltisb v2,0
vspltisb v3,-1
vmrglw v2,v2,v3
vsel v0,v1,v0,v2 # v0: reversal permutation vector
add r10,r5,r6 # r10 = qlp_coeff+order (end of coefficients)
lvsl v5,0,r5 # v5: coefficient alignment permutation vector
vperm v5,v5,v5,v0 # v5: reversal coefficient alignment permutation vector
mr r11,r8
lvsl v4,0,r11 # v4: history alignment permutation vector
lvx v0,0,r5
addi r5,r5,16
lvx v1,0,r5
vperm v0,v0,v1,v5
lvx v2,0,r11
addi r11,r11,-16
lvx v3,0,r11
vperm v2,v3,v2,v4
cmplw cr0,r5,r10
bc 12,0,L2101 # more coefficients left: load the second vector
vand v0,v0,v6
# lis/la with @ha/@l builds the full address of the entry label
lis r31,L2301@ha
la r31,L2301@l(r31)
b L2199
L2101:
# second (and last) coefficient/history vector; v7 is only a temporary here
addi r5,r5,16
lvx v7,0,r5
vperm v1,v1,v7,v5
addi r11,r11,-16
lvx v7,0,r11
vperm v3,v7,v3,v4
vand v1,v1,v6
lis r31,L2300@ha
la r31,L2300@l(r31)
L2199:
mtctr r31 # CTR: entry point used by the bcctr in the loop below
# set up invariant vectors
vspltish v4,0 # v4: zero vector
li r10,-12
lvsr v5,r10,r8 # v5: result shift vector
lvsl v6,r10,r3 # v6: residual shift back vector
li r10,-4
stw r7,-4(r9)
lvewx v7,r10,r9 # v7: lp_quantization vector
L2200:
vmulosh v8,v0,v2 # v8: sum vector
bcctr 20,0 # unconditional jump to L2300 or L2301, as selected above
L2300:
vmulosh v9,v1,v3
vsldoi v3,v3,v2,4 # increment history
vaddsws v8,v8,v9
L2301:
vsumsws v8,v8,v4 # v8[3]: sum
vsraw v8,v8,v7 # v8[3]: sum >> lp_quantization
lvewx v9,0,r3 # v9[n]: *residual
vperm v9,v9,v9,v6 # v9[3]: *residual
vaddsws v8,v9,v8 # v8[3]: *residual + (sum >> lp_quantization)
vsldoi v6,v6,v6,4 # increment shift vector
vperm v9,v8,v8,v5 # v9[n]: shift for storage
vsldoi v5,v5,v5,12 # increment shift vector
stvewx v9,0,r8
vsldoi v8,v8,v8,12
vsldoi v2,v2,v8,4 # insert value onto history
addi r3,r3,4
addi r8,r8,4
cmplw cr0,r8,r4 # i<data_len
bc 12,0,L2200
L2400:
mtspr 256,r0 # restore old vrsave
lmw r31,-4(r1)
blr
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment