Commit 18976fa5 authored by Peng Bin's avatar Peng Bin Committed by Bin Peng

Add inv txfm2d sse2 for sizes with 4

Implement av1_lowbd_inv_txfm2d_add_4x4_sse2
Implement av1_lowbd_inv_txfm2d_add_4x8_sse2
Implement av1_lowbd_inv_txfm2d_add_8x4_sse2
Implement av1_lowbd_inv_txfm2d_add_4x16_sse2
Implement av1_lowbd_inv_txfm2d_add_16x4_sse2

A brief speed test shows that using the included SSE2 functions
completed by this CL speeds up the speed1 lowbitdepth encoder by >9%
and the lowbitdepth decoder by >25%, compared to the highbitdepth
implementation in the baseline.

Change-Id: I0576a2a146c0b1a7b483c9d35c3d21d979e263cd
parent 4917295b
......@@ -107,10 +107,14 @@ static INLINE void transpose_16bit_4x4(const __m128i *const in,
const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
// Unpack 32 bit elements resulting in:
// out[0]: 00 10 20 30 01 11 21 31
// out[1]: 02 12 22 32 03 13 23 33
// out[0]: 00 10 20 30
// out[1]: 01 11 21 31
// out[2]: 02 12 22 32
// out[3]: 03 13 23 33
out[0] = _mm_unpacklo_epi32(a0, a1);
out[1] = _mm_unpackhi_epi32(a0, a1);
out[1] = _mm_srli_si128(out[0], 8);
out[2] = _mm_unpackhi_epi32(a0, a1);
out[3] = _mm_srli_si128(out[2], 8);
}
static INLINE void transpose_16bit_4x8(const __m128i *const in,
......@@ -155,6 +159,54 @@ static INLINE void transpose_16bit_4x8(const __m128i *const in,
out[3] = _mm_unpackhi_epi64(b2, b3);
}
// Transpose an 8-wide x 4-high block of 16-bit values.
// in[0..3]  : four rows of eight 16-bit elements each.
// out[0..7] : eight registers, out[j] holding column j of the input in its
//             low 64 bits. The high 64 bits of every output are explicitly
//             cleared to zero (they are don't-care lanes for callers, but
//             this implementation zeroes them via `zeros`).
static INLINE void transpose_16bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // to:
  // a0: 00 10 01 11 02 12 03 13
  // a1: 20 30 21 31 22 32 23 33
  // a4: 04 14 05 15 06 16 07 17
  // a5: 24 34 25 35 26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30 01 11 21 31
  // b2: 04 14 24 34 05 15 25 35
  // b4: 02 12 22 32 03 13 23 33
  // b6: 06 16 26 36 07 17 27 37
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  // Unpack 64 bit elements against a zero register, resulting in one column
  // per output with a zeroed upper half:
  // out[0]: 00 10 20 30 00 00 00 00
  // out[1]: 01 11 21 31 00 00 00 00
  // out[2]: 02 12 22 32 00 00 00 00
  // out[3]: 03 13 23 33 00 00 00 00
  // out[4]: 04 14 24 34 00 00 00 00
  // out[5]: 05 15 25 35 00 00 00 00
  // out[6]: 06 16 26 36 00 00 00 00
  // out[7]: 07 17 27 37 00 00 00 00
  const __m128i zeros = _mm_setzero_si128();
  out[0] = _mm_unpacklo_epi64(b0, zeros);
  out[1] = _mm_unpackhi_epi64(b0, zeros);
  out[2] = _mm_unpacklo_epi64(b4, zeros);
  out[3] = _mm_unpackhi_epi64(b4, zeros);
  out[4] = _mm_unpacklo_epi64(b2, zeros);
  out[5] = _mm_unpackhi_epi64(b2, zeros);
  out[6] = _mm_unpacklo_epi64(b6, zeros);
  out[7] = _mm_unpackhi_epi64(b6, zeros);
}
static INLINE void transpose_16bit_8x8(const __m128i *const in,
__m128i *const out) {
// Unpack 16 bit elements. Goes from:
......
This diff is collapsed.
......@@ -59,6 +59,11 @@ static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
}
// Load four 32-bit values from `a` (16-byte aligned) and pack them with
// signed saturation into 16-bit lanes. The same packed quad ends up in both
// the low and the high 64 bits of the result, since the source vector is
// packed against itself.
static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
  const __m128i v = _mm_load_si128((const __m128i *)a);
  return _mm_packs_epi32(v, v);
}
// Store 8 16 bit values. Sign extend the values.
static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
const __m128i a_lo = _mm_unpacklo_epi16(a, a);
......@@ -107,6 +112,13 @@ static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
}
}
// Load `out_size` rows of four 32-bit coefficients, `stride` elements apart,
// packing each row to 16-bit lanes via load_32bit_to_16bit_w4().
static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
                                                 __m128i *out, int out_size) {
  // Advance the source pointer by one row per iteration instead of
  // recomputing i * stride.
  for (int r = 0; r < out_size; ++r, in += stride) {
    out[r] = load_32bit_to_16bit_w4(in);
  }
}
static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
int stride, __m128i *out,
int out_size) {
......@@ -194,6 +206,9 @@ typedef struct {
transform_1d_sse2 col, row; // vertical and horizontal
} transform_2d_sse2;
void av1_lowbd_inv_txfm2d_add_4x4_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
void av1_lowbd_inv_txfm2d_add_8x8_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
......@@ -208,6 +223,12 @@ void av1_lowbd_inv_txfm2d_add_64x64_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
#endif
void av1_lowbd_inv_txfm2d_add_4x8_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
void av1_lowbd_inv_txfm2d_add_8x4_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
void av1_lowbd_inv_txfm2d_add_8x16_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
......@@ -228,6 +249,12 @@ void av1_lowbd_inv_txfm2d_add_64x32_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
#endif
void av1_lowbd_inv_txfm2d_add_4x16_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
void av1_lowbd_inv_txfm2d_add_16x4_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
void av1_lowbd_inv_txfm2d_add_8x32_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
......
......@@ -329,15 +329,15 @@ TEST_P(AV1LbdInvTxfm2d, DISABLED_Speed) {
#include "av1/common/x86/av1_txfm_sse2.h"
const LbdInvTxfm2dFunc kLbdInvFuncSSE2List[TX_SIZES_ALL] = {
NULL, // TX_4X4
av1_lowbd_inv_txfm2d_add_4x4_sse2, // TX_4X4
av1_lowbd_inv_txfm2d_add_8x8_sse2, // TX_8X8
av1_lowbd_inv_txfm2d_add_16x16_sse2, // TX_16X16
av1_lowbd_inv_txfm2d_add_32x32_sse2, // TX_32X32
#if CONFIG_TX64X64
av1_lowbd_inv_txfm2d_add_64x64_sse2, // 64x64
#endif // CONFIG_TX64X64
NULL, // TX_4X8
NULL, // TX_8X4
av1_lowbd_inv_txfm2d_add_4x8_sse2, // TX_4X8
av1_lowbd_inv_txfm2d_add_8x4_sse2, // TX_8X4
av1_lowbd_inv_txfm2d_add_8x16_sse2, // TX_8X16
av1_lowbd_inv_txfm2d_add_16x8_sse2, // TX_16X8
av1_lowbd_inv_txfm2d_add_16x32_sse2, // TX_16X32
......@@ -346,8 +346,8 @@ const LbdInvTxfm2dFunc kLbdInvFuncSSE2List[TX_SIZES_ALL] = {
av1_lowbd_inv_txfm2d_add_32x64_sse2, // TX_32X64
av1_lowbd_inv_txfm2d_add_64x32_sse2, // TX_64X32
#endif // CONFIG_TX64X64
NULL, // TX_4X16
NULL, // TX_16X4
av1_lowbd_inv_txfm2d_add_4x16_sse2, // TX_4X16
av1_lowbd_inv_txfm2d_add_16x4_sse2, // TX_16X4
av1_lowbd_inv_txfm2d_add_8x32_sse2, // 8x32
av1_lowbd_inv_txfm2d_add_32x8_sse2, // 32x8
#if CONFIG_TX64X64
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment