Commit 8b8aaffc authored by Peng Bin, committed by Zoe Liu

Refactor pair_set_epi16 for speedup

Use _mm_set1_epi32 instead of _mm_set_epi16, so that the compiler produces
fewer instructions. This patch also removes the duplicate definition of the
same macro.

Speed test results:
1. Unit tests for each test case in SSE2/AV1LbdInvTxfm2d show a 60%~80%
speedup (except for cases whose TX_TYPE includes iidentity)
2. A brief speed test shows that with this CL, at speed1 the encoder speeds up
by ~3% and the decoder speeds up by ~1.8%.
(Baseline is 18976fa5)

Change-Id: I2b0e12973fda05a21d6b6eb0f0efe11df6edfb84
parent cbfffa8e
...@@ -16,9 +16,8 @@ ...@@ -16,9 +16,8 @@
#include "aom/aom_integer.h" #include "aom/aom_integer.h"
#include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms.h"
#define pair_set_epi16(a, b) \ #define pair_set_epi16(a, b) \
_mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
#define dual_set_epi16(a, b) \ #define dual_set_epi16(a, b) \
_mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
......
...@@ -17,16 +17,13 @@ ...@@ -17,16 +17,13 @@
#include "./av1_rtcd.h" #include "./av1_rtcd.h"
#include "aom/aom_integer.h" #include "aom/aom_integer.h"
#include "aom_dsp/x86/transpose_sse2.h" #include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "av1/common/av1_txfm.h" #include "av1/common/av1_txfm.h"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
#define pair_set_epi16(a, b) \
_mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \ #define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
{ \ { \
__m128i t0 = _mm_unpacklo_epi16(in0, in1); \ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment