Commit 3fcb356e authored by Yi Luo
Browse files

Update partial inverse DCT according to VP9

- Partial inverse DCT unit tests have been enhanced.
- IDCT x86_64 assembly code has been removed.

Change-Id: Ic3bed2c0e70abdfd642a4f74fa969cc672d4795f
parent 75cf92f4
...@@ -238,9 +238,7 @@ DSP_SRCS-yes += inv_txfm.c ...@@ -238,9 +238,7 @@ DSP_SRCS-yes += inv_txfm.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm
ifeq ($(ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c
DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm
endif # ARCH_X86_64
ifeq ($(HAVE_NEON_ASM),yes) ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/save_reg_neon$(ASM) DSP_SRCS-yes += arm/save_reg_neon$(ASM)
......
...@@ -425,10 +425,10 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { ...@@ -425,10 +425,10 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_idct4x4_1_add sse2/; specialize qw/aom_idct4x4_1_add sse2/;
add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct8x8_64_add sse2/, "$ssse3_x86_64"; specialize qw/aom_idct8x8_64_add sse2 ssse3/;
add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct8x8_12_add sse2/, "$ssse3_x86_64"; specialize qw/aom_idct8x8_12_add sse2 ssse3/;
add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct8x8_1_add sse2/; specialize qw/aom_idct8x8_1_add sse2/;
...@@ -436,6 +436,8 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { ...@@ -436,6 +436,8 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct16x16_256_add sse2/; specialize qw/aom_idct16x16_256_add sse2/;
add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct16x16_10_add sse2/; specialize qw/aom_idct16x16_10_add sse2/;
...@@ -443,15 +445,15 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { ...@@ -443,15 +445,15 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_idct16x16_1_add sse2/; specialize qw/aom_idct16x16_1_add sse2/;
add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_1024_add sse2/, "$ssse3_x86_64"; specialize qw/aom_idct32x32_1024_add sse2 ssse3/;
add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_135_add sse2/, "$ssse3_x86_64"; specialize qw/aom_idct32x32_135_add sse2 ssse3/;
# Need to add 135 eob idct32x32 implementations. # Need to add 135 eob idct32x32 implementations.
$aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2; $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_34_add sse2/, "$ssse3_x86_64"; specialize qw/aom_idct32x32_34_add sse2 ssse3/;
add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_1_add sse2/; specialize qw/aom_idct32x32_1_add sse2/;
...@@ -480,10 +482,10 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { ...@@ -480,10 +482,10 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_idct8x8_1_add sse2 neon dspr2 msa/; specialize qw/aom_idct8x8_1_add sse2 neon dspr2 msa/;
add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; specialize qw/aom_idct8x8_64_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; specialize qw/aom_idct8x8_12_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct16x16_1_add sse2 neon dspr2 msa/; specialize qw/aom_idct16x16_1_add sse2 neon dspr2 msa/;
...@@ -491,14 +493,16 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { ...@@ -491,14 +493,16 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct16x16_256_add sse2 neon dspr2 msa/; specialize qw/aom_idct16x16_256_add sse2 neon dspr2 msa/;
add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct16x16_10_add sse2 neon dspr2 msa/; specialize qw/aom_idct16x16_10_add sse2 neon dspr2 msa/;
add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; specialize qw/aom_idct32x32_1024_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; specialize qw/aom_idct32x32_135_add sse2 ssse3 neon dspr2 msa/;
# Need to add 135 eob idct32x32 implementations. # Need to add 135 eob idct32x32 implementations.
$aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2; $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
$aom_idct32x32_135_add_neon=aom_idct32x32_1024_add_neon; $aom_idct32x32_135_add_neon=aom_idct32x32_1024_add_neon;
...@@ -506,7 +510,7 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { ...@@ -506,7 +510,7 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
$aom_idct32x32_135_add_msa=aom_idct32x32_1024_add_msa; $aom_idct32x32_135_add_msa=aom_idct32x32_1024_add_msa;
add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; specialize qw/aom_idct32x32_34_add sse2 ssse3 neon dspr2 msa/;
# Need to add 34 eob idct32x32 neon implementation. # Need to add 34 eob idct32x32 neon implementation.
$aom_idct32x32_34_add_neon=aom_idct32x32_1024_add_neon; $aom_idct32x32_34_add_neon=aom_idct32x32_1024_add_neon;
......
...@@ -747,6 +747,32 @@ void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) { ...@@ -747,6 +747,32 @@ void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
output[15] = WRAPLOW(-x1); output[15] = WRAPLOW(-x1);
} }
// Partial 16x16 inverse DCT that adds the reconstructed residual into `dest`.
//
// input:  16x16 coefficient block in row-major order. Only the first 8 rows
//         are read; the caller must guarantee that every non-zero
//         coefficient lies in the upper-left 8x8 area.
// dest:   destination pixels, updated in place.
// stride: distance, in pixels, between consecutive rows of `dest`.
void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  // Row-pass results. Rows 8..15 are never written and stay zero from the
  // initializer, matching the all-zero coefficient rows that are skipped.
  tran_low_t row_pass[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Pass 1: 1-D inverse transform of the first 8 rows only.
  for (row = 0; row < 8; ++row) {
    aom_idct16_c(input + row * 16, row_pass + row * 16);
  }

  // Pass 2: 1-D inverse transform of each of the 16 columns, then round by
  // 6 bits and accumulate into the destination with pixel clipping.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = row_pass[row * 16 + col];
    aom_idct16_c(col_in, col_out);
    for (row = 0; row < 16; ++row) {
      uint8_t *const px = &dest[row * stride + col];
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}
void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
int stride) { int stride) {
tran_low_t out[16 * 16] = { 0 }; tran_low_t out[16 * 16] = { 0 };
......
...@@ -258,63 +258,6 @@ void aom_iadst4_sse2(__m128i *in) { ...@@ -258,63 +258,6 @@ void aom_iadst4_sse2(__m128i *in) {
in[1] = _mm_packs_epi32(u[2], u[3]); in[1] = _mm_packs_epi32(u[2], u[3]);
} }
// Transposes an 8x8 block of 16-bit values held in eight __m128i registers.
//
// in0..in7 are the eight input rows; out0..out7 receive the eight transposed
// rows (outN holds lane N of in0..in7, in that order). The shuffle runs in
// three stages: 16-bit interleaves (tr0_*), 32-bit interleaves (tr1_*), and
// finally 64-bit interleaves into the outputs.
//
// Notes:
// - Every input is consumed in the first stage, before any output is
//   assigned, so the outputs may name the same variables as the inputs.
// - Each input argument is expanded twice (low and high unpack); do not pass
//   expressions with side effects.
// - Expands to a braced block declaring tr0_*/tr1_* locals, so it must be
//   used where a compound statement is valid.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out2, out3, out4, out5, out6, out7) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
\
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
\
out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
}
// Shuffles four __m128i inputs (tmp0..tmp3) of 16-bit lanes into four
// outputs (out0..out3) using 16-, 32- and 64-bit interleaves.
//
// Unlike a plain transpose, the first stage mixes halves and operand order:
// it takes the HIGH half of (tmp0, tmp1), the LOW half of (tmp1, tmp0) with
// the operands swapped, the LOW half of (tmp2, tmp3), and the HIGH half of
// (tmp3, tmp2) with the operands swapped.
// NOTE(review): the input packing this ordering expects is set up by the
// callers, which are not visible here - confirm before reusing the macro.
//
// All inputs are read before any output is written. Expands to a braced
// block declaring tr0_*/tr1_* locals; each argument is expanded twice.
#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
{ \
const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
\
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
\
out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
}
// Reduced transpose that reads only the low four 16-bit lanes of each of the
// four inputs in0..in3 (the high lanes are ignored).
//
// After the macro runs:
//   out0 = { lane 0 of in0..in3, lane 1 of in0..in3 }
//   out1 = { lane 2 of in0..in3, lane 3 of in0..in3 }
// i.e. the 4x4 low corner is transposed and packed two columns per register.
//
// Both temporaries are computed before out0/out1 are written. Expands to a
// braced block declaring tr0_0/tr0_1 locals.
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
}
// Define Macro for multiplying elements by constants and adding them together. // Define Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
res0, res1, res2, res3) \ res0, res1, res2, res3) \
......
...@@ -56,6 +56,37 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { ...@@ -56,6 +56,37 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
} }
// Transposes an 8x8 block of 16-bit values held in eight __m128i registers.
//
// in0..in7 are the eight input rows; out0..out7 receive the eight transposed
// rows (outN holds lane N of in0..in7, in that order). The shuffle runs in
// three stages: 16-bit interleaves (tr0_*), 32-bit interleaves (tr1_*), and
// finally 64-bit interleaves into the outputs.
//
// Notes:
// - Every input is consumed in the first stage, before any output is
//   assigned, so the outputs may name the same variables as the inputs.
// - Each input argument is expanded twice (low and high unpack); do not pass
//   expressions with side effects.
// - Expands to a braced block declaring tr0_*/tr1_* locals, so it must be
//   used where a compound statement is valid.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out2, out3, out4, out5, out6, out7) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
\
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
\
out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
}
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ #define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
{ \ { \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
...@@ -195,6 +226,32 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { ...@@ -195,6 +226,32 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
RECON_AND_STORE(dest + 15 * stride, in[15]); RECON_AND_STORE(dest + 15 * stride, in[15]);
} }
// Shuffles four __m128i inputs (tmp0..tmp3) of 16-bit lanes into four
// outputs (out0..out3) using 16-, 32- and 64-bit interleaves.
//
// Unlike a plain transpose, the first stage mixes halves and operand order:
// it takes the HIGH half of (tmp0, tmp1), the LOW half of (tmp1, tmp0) with
// the operands swapped, the LOW half of (tmp2, tmp3), and the HIGH half of
// (tmp3, tmp2) with the operands swapped.
// NOTE(review): the input packing this ordering expects is set up by the
// callers, which are not visible here - confirm before reusing the macro.
//
// All inputs are read before any output is written. Expands to a braced
// block declaring tr0_*/tr1_* locals; each argument is expanded twice.
#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
{ \
const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
\
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
\
out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
}
// Reduced transpose that reads only the low four 16-bit lanes of each of the
// four inputs in0..in3 (the high lanes are ignored).
//
// After the macro runs:
//   out0 = { lane 0 of in0..in3, lane 1 of in0..in3 }
//   out1 = { lane 2 of in0..in3, lane 3 of in0..in3 }
// i.e. the 4x4 low corner is transposed and packed two columns per register.
//
// Both temporaries are computed before out0/out1 are written. Expands to a
// braced block declaring tr0_0/tr0_1 locals.
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
}
void iadst16_8col(__m128i *in); void iadst16_8col(__m128i *in);
void idct16_8col(__m128i *in); void idct16_8col(__m128i *in);
void aom_idct4_sse2(__m128i *in); void aom_idct4_sse2(__m128i *in);
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment