Commit 8b8aaffc authored by Peng Bin, committed by Zoe Liu

Refactor pair_set_epi16 for speedup

Use _mm_set1_epi32 instead of _mm_set_epi16, so that the compiler produces
fewer instructions. This patch also removes the duplicate definition of the same macro.

Speed test results:
1. Unit tests for each test case in SSE2/AV1LbdInvTxfm2d show a 60%~80%
speedup (except those cases whose TX_TYPE includes iidentity)
2. A brief speed test shows that with this CL, for speed1, the encoder speeds up
by ~3% and the decoder speeds up by ~1.8%.
(Baseline is 18976fa5)

Change-Id: I2b0e12973fda05a21d6b6eb0f0efe11df6edfb84
parent cbfffa8e
......@@ -17,8 +17,7 @@
#include "aom_dsp/x86/synonyms.h"
#define pair_set_epi16(a, b) \
_mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
_mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
#define dual_set_epi16(a, b) \
_mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
......@@ -17,16 +17,13 @@
#include "./av1_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "av1/common/av1_txfm.h"
#ifdef __cplusplus
extern "C" {
#define pair_set_epi16(a, b) \
_mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
{ \
__m128i t0 = _mm_unpacklo_epi16(in0, in1); \
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment