Commit 11b5d51b authored by Erik de Castro Lopo's avatar Erik de Castro Lopo

Add AVX2 CPU support stream encoder.

Patch-from: lvqcl <lvqcl.mail@gmail.com>
parent a75b8705
......@@ -112,6 +112,7 @@ libFLAC_sources = \
lpc_intrin_sse.c \
lpc_intrin_sse2.c \
lpc_intrin_sse41.c \
lpc_intrin_avx2.c \
md5.c \
memory.c \
metadata_iterators.c \
......@@ -120,6 +121,7 @@ libFLAC_sources = \
stream_encoder.c \
stream_encoder_intrin_sse2.c \
stream_encoder_intrin_ssse3.c \
stream_encoder_intrin_avx2.c \
stream_encoder_framing.c \
window.c \
$(extra_ogg_sources)
......
......@@ -90,6 +90,7 @@ SRCS_C = \
lpc_intrin_sse.c \
lpc_intrin_sse2.c \
lpc_intrin_sse41.c \
lpc_intrin_avx2.c \
md5.c \
memory.c \
metadata_iterators.c \
......@@ -98,6 +99,7 @@ SRCS_C = \
stream_encoder.c \
stream_encoder_intrin_sse2.c \
stream_encoder_intrin_ssse3.c \
stream_encoder_intrin_avx2.c \
stream_encoder_framing.c \
window.c \
$(OGG_SRCS)
......
......@@ -166,6 +166,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
# endif
# ifdef FLAC__AVX2_SUPPORTED
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
# endif
# endif
#endif
......
......@@ -57,6 +57,11 @@ extern void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32
unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps);
#endif
#ifdef FLAC__AVX2_SUPPORTED
extern void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps);
#endif
#endif
#endif
......@@ -221,6 +221,12 @@
<ClCompile Include="float.c" />
<ClCompile Include="format.c" />
<ClCompile Include="lpc.c" />
<ClCompile Include="lpc_intrin_avx2.c">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<ClCompile Include="lpc_intrin_sse.c" />
<ClCompile Include="lpc_intrin_sse2.c" />
<ClCompile Include="lpc_intrin_sse41.c" />
......@@ -235,6 +241,12 @@
<ClCompile Include="stream_decoder.c" />
<ClCompile Include="stream_encoder.c" />
<ClCompile Include="stream_encoder_framing.c" />
<ClCompile Include="stream_encoder_intrin_avx2.c">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<ClCompile Include="stream_encoder_intrin_sse2.c" />
<ClCompile Include="stream_encoder_intrin_ssse3.c" />
<ClCompile Include="window.c" />
......
......@@ -157,6 +157,9 @@
<ClCompile Include="lpc_intrin_sse41.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="lpc_intrin_avx2.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="md5.c">
<Filter>Source Files</Filter>
</ClCompile>
......@@ -196,6 +199,9 @@
<ClCompile Include="stream_encoder_intrin_ssse3.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="stream_encoder_intrin_avx2.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="window.c">
<Filter>Source Files</Filter>
</ClCompile>
......
......@@ -181,6 +181,12 @@
<ClCompile Include="float.c" />
<ClCompile Include="format.c" />
<ClCompile Include="lpc.c" />
<ClCompile Include="lpc_intrin_avx2.c">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<ClCompile Include="lpc_intrin_sse.c" />
<ClCompile Include="lpc_intrin_sse2.c" />
<ClCompile Include="lpc_intrin_sse41.c" />
......@@ -195,6 +201,12 @@
<ClCompile Include="stream_decoder.c" />
<ClCompile Include="stream_encoder.c" />
<ClCompile Include="stream_encoder_framing.c" />
<ClCompile Include="stream_encoder_intrin_avx2.c">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">/arch:AVX %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<ClCompile Include="stream_encoder_intrin_sse2.c" />
<ClCompile Include="stream_encoder_intrin_ssse3.c" />
<ClCompile Include="window.c" />
......
......@@ -157,6 +157,9 @@
<ClCompile Include="lpc_intrin_sse41.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="lpc_intrin_avx2.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="md5.c">
<Filter>Source Files</Filter>
</ClCompile>
......@@ -196,6 +199,9 @@
<ClCompile Include="stream_encoder_intrin_ssse3.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="stream_encoder_intrin_avx2.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="window.c">
<Filter>Source Files</Filter>
</ClCompile>
......
This diff is collapsed.
......@@ -950,6 +950,13 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41;
}
# endif
# ifdef FLAC__AVX2_SUPPORTED
if(encoder->private_->cpuinfo.ia32.avx2) {
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2;
}
# endif
# ifdef FLAC__SSE2_SUPPORTED
if (encoder->private_->cpuinfo.ia32.sse2) {
......@@ -986,6 +993,13 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41;
}
# endif
# ifdef FLAC__AVX2_SUPPORTED
if(encoder->private_->cpuinfo.x86.avx2) {
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2;
}
# endif
# ifdef FLAC__SSE2_SUPPORTED
encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2;
......@@ -1013,6 +1027,10 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
if(encoder->private_->cpuinfo.ia32.ssse3)
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3;
# endif
# ifdef FLAC__AVX2_SUPPORTED
if(encoder->private_->cpuinfo.ia32.avx2)
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_avx2;
# endif
# elif defined FLAC__CPU_X86_64
# ifdef FLAC__SSE2_SUPPORTED
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_sse2;
......@@ -1021,6 +1039,10 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
if(encoder->private_->cpuinfo.x86.ssse3)
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3;
# endif
# ifdef FLAC__AVX2_SUPPORTED
if(encoder->private_->cpuinfo.x86.avx2)
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_avx2;
# endif
# endif /* FLAC__CPU_... */
}
#endif /* !FLAC__NO_ASM && FLAC__HAS_X86INTRIN */
......
/* libFLAC - Free Lossless Audio Codec library
* Copyright (C) 2000-2009 Josh Coalson
* Copyright (C) 2011-2014 Xiph.Org Foundation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of the Xiph.org Foundation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/stream_encoder.h"
#include "private/bitmath.h"
#ifdef FLAC__AVX2_SUPPORTED
#include <stdlib.h> /* for abs() */
#include <immintrin.h> /* AVX2 */
#include "FLAC/assert.h"
FLAC__SSE_TARGET("avx2")
void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
{
const unsigned default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
unsigned partitions = 1u << max_partition_order;
FLAC__ASSERT(default_partition_samples > predictor_order);
/* first do max_partition_order */
{
unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order);
__m256i res256, sum256;
__m128i res128, sum128;
if(FLAC__bitmath_ilog2(default_partition_samples) + bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
for(partition = residual_sample = 0; partition < partitions; partition++) {
end += default_partition_samples;
sum256 = _mm256_setzero_si256();
for( ; (int)residual_sample < (int)end-7; residual_sample+=8) {
res256 = _mm256_abs_epi32(_mm256_loadu_si256((const __m256i*)(residual+residual_sample)));
sum256 = _mm256_add_epi32(sum256, res256);
}
sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));
for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
sum128 = _mm_add_epi32(sum128, res128);
}
for( ; residual_sample < end; residual_sample++) {
res128 = _mm_cvtsi32_si128(residual[residual_sample]);
res128 = _mm_abs_epi32(res128);
sum128 = _mm_add_epi32(sum128, res128);
}
sum128 = _mm_hadd_epi32(sum128, sum128);
sum128 = _mm_hadd_epi32(sum128, sum128);
abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(sum128);
}
}
else { /* have to pessimistically use 64 bits for accumulator */
for(partition = residual_sample = 0; partition < partitions; partition++) {
end += default_partition_samples;
sum256 = _mm256_setzero_si256();
for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
res256 = _mm256_cvtepu32_epi64(res128);
sum256 = _mm256_add_epi64(sum256, res256);
}
sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));
for( ; (int)residual_sample < (int)end-1; residual_sample+=2) {
res128 = _mm_loadl_epi64((const __m128i*)(residual+residual_sample));
res128 = _mm_abs_epi32(res128);
res128 = _mm_cvtepu32_epi64(res128);
sum128 = _mm_add_epi64(sum128, res128);
}
for( ; residual_sample < end; residual_sample++) {
res128 = _mm_cvtsi32_si128(residual[residual_sample]);
res128 = _mm_abs_epi32(res128);
sum128 = _mm_add_epi64(sum128, res128);
}
sum128 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8));
_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), sum128);
}
}
}
/* now merge partitions for lower orders */
{
unsigned from_partition = 0, to_partition = partitions;
int partition_order;
for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
unsigned i;
partitions >>= 1;
for(i = 0; i < partitions; i++) {
abs_residual_partition_sums[to_partition++] =
abs_residual_partition_sums[from_partition ] +
abs_residual_partition_sums[from_partition+1];
from_partition += 2;
}
}
}
_mm256_zeroupper();
}
#endif /* FLAC__AVX2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment