Skip to content
Snippets Groups Projects
Commit e50e8084 authored by John Ridges's avatar John Ridges Committed by Jean-Marc Valin
Browse files

Improved SSE version of xcorr_kernel()

The loop no longer reads past its buffer and is slightly faster.
Also fixes RESTORE_STACK in celt_iir().
parent 70c9c3a4
No related branches found
No related tags found
No related merge requests found
......@@ -217,8 +217,8 @@ void celt_iir(const opus_val32 *_x,
}
for(i=0;i<ord;i++)
mem[i] = _y[N-i-1];
#endif
RESTORE_STACK;
#endif
}
void _celt_autocorr(
......
/* Copyright (c) 2013 Xiph.Org Foundation
Written by Jean-Marc Valin */
/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
/**
@file pitch_sse.h
@brief Pitch analysis
......@@ -37,61 +36,39 @@
#include "arch.h"
#define OVERRIDE_XCORR_KERNEL
static inline void xcorr_kernel(const opus_val16 * _x, const opus_val16 * _y, opus_val32 _sum[4], int len)
static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
int j;
__m128 sum;
__m128 x;
__m128 y;
__m128 y2;
__m128 y1;
__m128 y3;
__m128 tmp;
sum = _mm_loadu_ps(_sum);
__m128 xsum1, xsum2;
xsum1 = _mm_loadu_ps(sum);
xsum2 = _mm_setzero_ps();
x = _mm_loadu_ps(_x);
y = _mm_loadu_ps(_y);
y1 = _mm_loadu_ps(_y+1);
for (j=0;j<len-3;j+=4)
{
_x+=4;
_y+=4;
y2 = _mm_loadu_ps(_y);
y3 = _mm_loadu_ps(_y+1);
tmp = _mm_shuffle_ps(x, x, 0x00);
sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
tmp = _mm_shuffle_ps(x, x, 0x55);
sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y1));
tmp = _mm_shuffle_ps(x, x, 0xaa);
y = _mm_shuffle_ps(y, y2, 0x4e);
sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
tmp = _mm_shuffle_ps(x, x, 0xff);
y = _mm_shuffle_ps(y1, y3, 0x4e);
sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
x = _mm_loadu_ps(_x);
y = y2;
y1 = y3;
}
_y++;
if (j++<len)
for (j = 0; j < len-3; j += 4)
{
tmp = _mm_shuffle_ps(x, x, 0x00);
sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
}
if (j++<len)
{
tmp = _mm_shuffle_ps(x, x, 0x55);
y = _mm_loadu_ps(_y++);
sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
__m128 x0 = _mm_loadu_ps(x+j);
__m128 y0 = _mm_loadu_ps(y+j);
__m128 y3 = _mm_loadu_ps(y+j+3);
xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),y0));
xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
_mm_shuffle_ps(y0,y3,0x49)));
xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
_mm_shuffle_ps(y0,y3,0x9e)));
xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
}
if (j++<len)
if (j < len)
{
tmp = _mm_shuffle_ps(x, x, 0xaa);
y = _mm_loadu_ps(_y++);
sum = _mm_add_ps(sum, _mm_mul_ps(tmp, y));
xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
if (++j < len)
{
xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
if (++j < len)
{
xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
}
}
}
_mm_storeu_ps(_sum, sum);
_mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment