Commit 78136edc authored by Jingning Han
Browse files

SSE2 high precision 32x32 forward DCT

Enable the SSE2 implementation of the high-precision 32x32 forward DCT. The
intermediate stacks are 32 bits wide. The run-time goes down from
32126 cycles to 13442 cycles.

Change-Id: Ib5ccafe3176c65bd6f2dbdef790bd47bbc880e56
parent b89eef8f
......@@ -143,7 +143,7 @@ typedef struct {
unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
unsigned char segment_id; // Segment id for current frame
// Flags used for prediction status of various bistream signals
// Flags used for prediction status of various bit-stream signals
unsigned char seg_id_predicted;
// Indicates if the mb is part of the image (1) vs border (0)
......
......@@ -27,6 +27,9 @@
// Builds a __m128i whose four 32-bit lanes each hold `a` in the low 16 bits
// and `b` in the high 16 bits. Both arguments are truncated to 16 bits.
// `b` is widened to uint32_t before the shift: a uint16_t operand promotes to
// signed int, and shifting a value >= 0x8000 left by 16 would overflow int,
// which is undefined behavior.
#define pair_set_epi16(a, b) \
  _mm_set1_epi32(            \
      (int)((uint32_t)(uint16_t)(a) + ((uint32_t)(uint16_t)(b) << 16)))
// Builds a __m128i whose 32-bit lanes, from lowest to highest, hold
// a, b, a, b. Each argument appears twice in the expansion, so it is
// evaluated twice; pass side-effect-free expressions.
#define pair_set_epi32(a, b) _mm_set_epi32((b), (a), (b), (a))
// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,
......
......@@ -740,7 +740,7 @@ prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int p
specialize vp9_short_fdct8x4 sse2
prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct32x32
specialize vp9_short_fdct32x32 sse2
prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct32x32_rd sse2
......
......@@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include <math.h>
#include <limits.h>
......
This diff is collapsed.
......@@ -2573,13 +2573,13 @@ void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
}
#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2
#define FDCT32x32_LOW_PRECISION 1
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
#undef FDCT32x32_2D
#undef FDCT32x32_LOW_PRECISION
#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D vp9_short_fdct32x32_sse2
#define FDCT32x32_LOW_PRECISION 0
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_LOW_PRECISION
#undef FDCT32x32_HIGH_PRECISION
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment