Commit 4f510809 authored by Geza Lore

Flip the result of the inverse transform for FLIPADST.

When using FLIPADST, the vp10_inv_txfm_add functions used to flip
the destination array, add the result of the inverse transform to
it, and then flip the destination back. This has been replaced by
flipping the result of the inverse transform before adding it to the
destination. Up-down flipping is done by negating the destination
stride and starting from the bottom row, so it is now essentially free.
Left-right flipping is done with the usual SSE2 instructions in the
optimized code.
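
(Illustration only, not part of the patch: the up-down flip reduces to
the pointer adjustment below, written as a hypothetical helper; the diff
implements the same thing as the FLIPUD_PTR macro.)

/* Flip rows for free by negating the stride: afterwards, row i of the
 * transform output lands on row (size - 1 - i) of the destination. */
static void flipud_ptr_sketch(uint8_t **dest, int *stride, int size) {
  *dest += (size - 1) * (*stride);  /* start at the bottom row */
  *stride = -(*stride);             /* subsequent rows walk upwards */
}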

The C functions are expected to match the SSE2 functions, so the C
functions now do the flipping as well when required. Adding this cleanly
required some refactoring of the C functions, but there is no measurable
performance impact when ext-tx is not enabled.

Encode speedup with ext-tx enabled is about 3%.

Change-Id: I5b04e5d720f0b9f0d54fd8607a8764f2314c7234
parent 2b39bcec
@@ -11,6 +11,54 @@
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
+#include "vp10/common/enums.h"
 
+#if CONFIG_EXT_TX
+// Reverse the 8 16 bit words in __m128i
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
+  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
+  return _mm_shuffle_epi32(b, 0x4e);
+}
+
+static INLINE void fliplr_4x4(__m128i in[2]) {
+  in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+  in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
+  in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+  in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
+}
+
+static INLINE void fliplr_8x8(__m128i in[8]) {
+  in[0] = mm_reverse_epi16(in[0]);
+  in[1] = mm_reverse_epi16(in[1]);
+  in[2] = mm_reverse_epi16(in[2]);
+  in[3] = mm_reverse_epi16(in[3]);
+  in[4] = mm_reverse_epi16(in[4]);
+  in[5] = mm_reverse_epi16(in[5]);
+  in[6] = mm_reverse_epi16(in[6]);
+  in[7] = mm_reverse_epi16(in[7]);
+}
+
+static INLINE void fliplr_16x8(__m128i in[16]) {
+  fliplr_8x8(&in[0]);
+  fliplr_8x8(&in[8]);
+}
+
+#define FLIPLR_16x16(in0, in1) do {             \
+  __m128i *tmp;                                 \
+  fliplr_16x8(in0);                             \
+  fliplr_16x8(in1);                             \
+  tmp = (in0);                                  \
+  (in0) = (in1);                                \
+  (in1) = tmp;                                  \
+} while (0)
+
+#define FLIPUD_PTR(dest, stride, size) do {     \
+  (dest) = (dest) + ((size) - 1) * (stride);    \
+  (stride) = - (stride);                        \
+} while (0)
+#endif
+
 void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                              int tx_type) {
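
(Reviewer note on the shuffle constants in mm_reverse_epi16 above: 0x1b
is 0b00011011, which reverses the four 16-bit words within each 64-bit
half, and 0x4e is 0b01001110, which swaps the two halves, so the three
shuffles together reverse all eight words. A scalar model, for reference
only:)

/* Scalar equivalent of mm_reverse_epi16: reverse eight 16-bit lanes. */
static void reverse_epi16_sketch(int16_t w[8]) {
  for (int i = 0; i < 4; ++i) {
    int16_t t = w[i];
    w[i] = w[7 - i];
    w[7 - i] = t;
  }
}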
@@ -22,22 +70,50 @@ void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
   in[1] = load_input_data(input + 8);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct4_sse2(in);
       idct4_sse2(in);
       break;
+    case ADST_DCT:
+      idct4_sse2(in);
+      iadst4_sse2(in);
+      break;
+    case DCT_ADST:
+      iadst4_sse2(in);
+      idct4_sse2(in);
+      break;
-    case 1:  // ADST_DCT
+    case ADST_ADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
       idct4_sse2(in);
       iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
       break;
-    case 2:  // DCT_ADST
+    case DCT_FLIPADST:
       iadst4_sse2(in);
       idct4_sse2(in);
+      fliplr_4x4(in);
       break;
+    case FLIPADST_FLIPADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      fliplr_4x4(in);
+      break;
-    case 3:  // ADST_ADST
+    case ADST_FLIPADST:
       iadst4_sse2(in);
       iadst4_sse2(in);
+      fliplr_4x4(in);
       break;
+    case FLIPADST_ADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
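
(Reviewer note: every FLIPADST case above follows the same recipe — run
the ordinary iadst, then undo the flip at the output: fliplr_4x4 for a
flipped row transform, FLIPUD_PTR on the destination for a flipped
column transform. Conceptually, in scalar form — iadst4_sketch is a
stand-in function pointer, not a function from this patch:)

/* Inverse FLIPADST = inverse ADST followed by reversing the output. */
static void iflipadst4_sketch(const int16_t *in, int16_t *out,
                              void (*iadst4_sketch)(const int16_t *,
                                                    int16_t *)) {
  int16_t tmp[4];
  iadst4_sketch(in, tmp);
  for (int i = 0; i < 4; ++i) out[i] = tmp[3 - i];
}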
@@ -52,12 +128,12 @@ void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
 
   // Reconstruction and Store
   {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+    __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
+    __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+    d0 = _mm_unpacklo_epi32(d0, d1);
+    d2 = _mm_unpacklo_epi32(d2, d3);
     d0 = _mm_unpacklo_epi8(d0, zero);
     d2 = _mm_unpacklo_epi8(d2, zero);
     d0 = _mm_add_epi16(d0, in[0]);
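
(Reviewer note: loading every row as dest + stride * i, instead of
folding rows 1 and 3 into the unpack calls, is what lets this code run
unchanged after FLIPUD_PTR has negated the stride — each index then
counts rows from the bottom. The scalar shape of this reconstruction
step, with a possibly negative stride:)

/* Add the (possibly flipped) inverse-transform residual to dest and
 * clamp to 8 bits; 'stride' may be negative after FLIPUD_PTR. */
static void recon_add_sketch(uint8_t *dest, int stride,
                             const int16_t *residual, int size) {
  for (int r = 0; r < size; ++r) {
    for (int c = 0; c < size; ++c) {
      const int v = dest[r * stride + c] + residual[r * size + c];
      dest[r * stride + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
  }
}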
@@ -94,22 +170,50 @@ void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
   in[7] = load_input_data(input + 8 * 7);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct8_sse2(in);
       idct8_sse2(in);
       break;
+    case ADST_DCT:
+      idct8_sse2(in);
+      iadst8_sse2(in);
+      break;
+    case DCT_ADST:
+      iadst8_sse2(in);
+      idct8_sse2(in);
+      break;
-    case 1:  // ADST_DCT
+    case ADST_ADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
       idct8_sse2(in);
       iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
       break;
-    case 2:  // DCT_ADST
+    case DCT_FLIPADST:
       iadst8_sse2(in);
       idct8_sse2(in);
+      fliplr_8x8(in);
       break;
-    case 3:  // ADST_ADST
+    case FLIPADST_FLIPADST:
       iadst8_sse2(in);
       iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      fliplr_8x8(in);
       break;
+    case ADST_FLIPADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      fliplr_8x8(in);
+      break;
+    case FLIPADST_ADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -146,29 +250,59 @@ void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
 
 void vp10_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride, int tx_type) {
-  __m128i in0[16], in1[16];
+  __m128i in[32];
+  __m128i *in0 = &in[0];
+  __m128i *in1 = &in[16];
 
   load_buffer_8x16(input, in0);
   input += 8;
   load_buffer_8x16(input, in1);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
+    case ADST_DCT:
+      idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      break;
+    case DCT_ADST:
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      break;
+    case ADST_ADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      break;
-    case 1:  // ADST_DCT
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
       idct16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
       break;
-    case 2:  // DCT_ADST
+    case DCT_FLIPADST:
       iadst16_sse2(in0, in1);
       idct16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
       break;
+    case FLIPADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case ADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
-    case 3:  // ADST_ADST
+    case FLIPADST_ADST:
       iadst16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
       break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
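
(Reviewer note on the 16x16 path: the coefficients sit in two 8-column
halves, in0 and in1, so a left-right flip must reverse the words inside
each half and then swap the halves — which is why FLIPLR_16x16
exchanges the two pointers, and why in0/in1 became pointers into a
single in[32] block rather than separate arrays. A toy model on plain
arrays:)

/* Reverse each 8-wide half, then swap the halves:
 * [a0..a7 | b0..b7] becomes [b7..b0 | a7..a0]. */
static void fliplr16_sketch(int16_t **lo, int16_t **hi) {
  int16_t *tmp;
  for (int i = 0; i < 4; ++i) {
    int16_t t;
    t = (*lo)[i]; (*lo)[i] = (*lo)[7 - i]; (*lo)[7 - i] = t;
    t = (*hi)[i]; (*hi)[i] = (*hi)[7 - i]; (*hi)[7 - i] = t;
  }
  tmp = *lo; *lo = *hi; *hi = tmp;
}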