Commit cb5c47f2, authored by Debargha Mukherjee, committed by Gerrit Code Review

Merge "Accelerated transform in high bit depth"

parents 194b374b 406030d1
...@@ -85,16 +85,26 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/; ...@@ -85,16 +85,26 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
# dct # dct
# #
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure # Force C versions if CONFIG_EMULATE_HARDWARE is 1
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht4x4_16_add/; specialize qw/vp9_iht4x4_16_add/;
add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht8x8_64_add/; specialize qw/vp9_iht8x8_64_add/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp9_iht16x16_256_add/; specialize qw/vp9_iht16x16_256_add/;
} else {
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht4x4_16_add sse2/;
add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/vp9_iht8x8_64_add sse2/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp9_iht16x16_256_add sse2/;
}
} else { } else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1 # Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
......
...@@ -12,14 +12,14 @@ ...@@ -12,14 +12,14 @@
#include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h" #include "vpx_ports/mem.h"
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) { int tx_type) {
__m128i in[2]; __m128i in[2];
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8); const __m128i eight = _mm_set1_epi16(8);
in[0] = _mm_loadu_si128((const __m128i *)(input)); in[0] = load_input_data(input);
in[1] = _mm_loadu_si128((const __m128i *)(input + 8)); in[1] = load_input_data(input + 8);
switch (tx_type) { switch (tx_type) {
case 0: // DCT_DCT case 0: // DCT_DCT
...@@ -77,21 +77,21 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, ...@@ -77,21 +77,21 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
} }
} }
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) { int tx_type) {
__m128i in[8]; __m128i in[8];
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1 << 4); const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data // load input data
in[0] = _mm_load_si128((const __m128i *)input); in[0] = load_input_data(input);
in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); in[1] = load_input_data(input + 8 * 1);
in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); in[2] = load_input_data(input + 8 * 2);
in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); in[3] = load_input_data(input + 8 * 3);
in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); in[4] = load_input_data(input + 8 * 4);
in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); in[5] = load_input_data(input + 8 * 5);
in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); in[6] = load_input_data(input + 8 * 6);
in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); in[7] = load_input_data(input + 8 * 7);
switch (tx_type) { switch (tx_type) {
case 0: // DCT_DCT case 0: // DCT_DCT
...@@ -144,8 +144,8 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, ...@@ -144,8 +144,8 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
RECON_AND_STORE(dest + 7 * stride, in[7]); RECON_AND_STORE(dest + 7 * stride, in[7]);
} }
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
int tx_type) { int stride, int tx_type) {
__m128i in0[16], in1[16]; __m128i in0[16], in1[16];
load_buffer_8x16(input, in0); load_buffer_8x16(input, in0);
......
...@@ -626,39 +626,6 @@ if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes")) ...@@ -626,39 +626,6 @@ if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes"))
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure # Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_1_add/;
add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_16_add/;
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_1_add/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_64_add/;
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_12_add/;
add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_1_add/;
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_256_add/;
add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_10_add/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add/;
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add/;
add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_iwht4x4_1_add/; specialize qw/vpx_iwht4x4_1_add/;
...@@ -691,6 +658,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { ...@@ -691,6 +658,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1 # Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_16_add/;
add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_1_add/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_64_add/;
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_12_add/;
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_1_add/;
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_256_add/;
add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_10_add/;
add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_1_add/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add/;
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add/;
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct4x4_16_add/; specialize qw/vpx_highbd_idct4x4_16_add/;
...@@ -706,6 +706,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { ...@@ -706,6 +706,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct16x16_10_add/; specialize qw/vpx_highbd_idct16x16_10_add/;
} else { } else {
add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_16_add sse2/;
add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct4x4_1_add sse2/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_64_add sse2/;
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_12_add sse2/;
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_1_add sse2/;
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_256_add sse2/;
add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_10_add sse2/;
add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_1_add sse2/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add sse2/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2/;
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add sse2/;
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct4x4_16_add sse2/; specialize qw/vpx_highbd_idct4x4_16_add sse2/;
......
...@@ -21,7 +21,8 @@ ...@@ -21,7 +21,8 @@
*(int *)(dest) = _mm_cvtsi128_si32(d0); \ *(int *)(dest) = _mm_cvtsi128_si32(d0); \
} }
void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8); const __m128i eight = _mm_set1_epi16(8);
const __m128i cst = _mm_setr_epi16( const __m128i cst = _mm_setr_epi16(
...@@ -32,8 +33,8 @@ void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { ...@@ -32,8 +33,8 @@ void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i input0, input1, input2, input3; __m128i input0, input1, input2, input3;
// Rows // Rows
input0 = _mm_load_si128((const __m128i *)input); input0 = load_input_data(input);
input2 = _mm_load_si128((const __m128i *)(input + 8)); input2 = load_input_data(input + 8);
// Construct i3, i1, i3, i1, i2, i0, i2, i0 // Construct i3, i1, i3, i1, i2, i0, i2, i0
input0 = _mm_shufflelo_epi16(input0, 0xd8); input0 = _mm_shufflelo_epi16(input0, 0xd8);
...@@ -151,7 +152,8 @@ void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { ...@@ -151,7 +152,8 @@ void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
} }
} }
void vpx_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
__m128i dc_value; __m128i dc_value;
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
int a; int a;
...@@ -449,7 +451,8 @@ void iadst4_sse2(__m128i *in) { ...@@ -449,7 +451,8 @@ void iadst4_sse2(__m128i *in) {
out7 = _mm_subs_epi16(stp1_0, stp2_7); \ out7 = _mm_subs_epi16(stp1_0, stp2_7); \
} }
void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 4); const __m128i final_rounding = _mm_set1_epi16(1 << 4);
...@@ -469,14 +472,14 @@ void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { ...@@ -469,14 +472,14 @@ void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
int i; int i;
// Load input data. // Load input data.
in0 = _mm_load_si128((const __m128i *)input); in0 = load_input_data(input);
in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); in1 = load_input_data(input + 8 * 1);
in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); in2 = load_input_data(input + 8 * 2);
in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); in3 = load_input_data(input + 8 * 3);
in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); in4 = load_input_data(input + 8 * 4);
in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); in5 = load_input_data(input + 8 * 5);
in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); in6 = load_input_data(input + 8 * 6);
in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); in7 = load_input_data(input + 8 * 7);
// 2-D // 2-D
for (i = 0; i < 2; i++) { for (i = 0; i < 2; i++) {
...@@ -518,7 +521,8 @@ void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { ...@@ -518,7 +521,8 @@ void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE(dest + 7 * stride, in7); RECON_AND_STORE(dest + 7 * stride, in7);
} }
void vpx_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
__m128i dc_value; __m128i dc_value;
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
int a; int a;
...@@ -792,7 +796,8 @@ void iadst8_sse2(__m128i *in) { ...@@ -792,7 +796,8 @@ void iadst8_sse2(__m128i *in) {
in[7] = _mm_sub_epi16(k__const_0, s1); in[7] = _mm_sub_epi16(k__const_0, s1);
} }
void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 4); const __m128i final_rounding = _mm_set1_epi16(1 << 4);
...@@ -812,10 +817,10 @@ void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { ...@@ -812,10 +817,10 @@ void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// Rows. Load 4-row input data. // Rows. Load 4-row input data.
in0 = _mm_load_si128((const __m128i *)input); in0 = load_input_data(input);
in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); in1 = load_input_data(input + 8 * 1);
in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); in2 = load_input_data(input + 8 * 2);
in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); in3 = load_input_data(input + 8 * 3);
// 8x4 Transpose // 8x4 Transpose
TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
...@@ -1169,7 +1174,7 @@ void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { ...@@ -1169,7 +1174,7 @@ void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
stp2_10, stp2_13, stp2_11, stp2_12) \ stp2_10, stp2_13, stp2_11, stp2_12) \
} }
void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) { int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 5); const __m128i final_rounding = _mm_set1_epi16(1 << 5);
...@@ -1214,22 +1219,22 @@ void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, ...@@ -1214,22 +1219,22 @@ void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
// 1-D idct // 1-D idct
// Load input data. // Load input data.
in[0] = _mm_load_si128((const __m128i *)input); in[0] = load_input_data(input);
in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); in[8] = load_input_data(input + 8 * 1);
in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); in[1] = load_input_data(input + 8 * 2);
in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); in[9] = load_input_data(input + 8 * 3);
in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); in[2] = load_input_data(input + 8 * 4);
in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); in[10] = load_input_data(input + 8 * 5);
in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); in[3] = load_input_data(input + 8 * 6);
in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); in[11] = load_input_data(input + 8 * 7);
in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); in[4] = load_input_data(input + 8 * 8);
in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); in[12] = load_input_data(input + 8 * 9);
in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); in[5] = load_input_data(input + 8 * 10);
in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); in[13] = load_input_data(input + 8 * 11);
in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); in[6] = load_input_data(input + 8 * 12);
in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); in[14] = load_input_data(input + 8 * 13);
in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); in[7] = load_input_data(input + 8 * 14);
in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); in[15] = load_input_data(input + 8 * 15);
array_transpose_8x8(in, in); array_transpose_8x8(in, in);
array_transpose_8x8(in + 8, in + 8); array_transpose_8x8(in + 8, in + 8);
...@@ -1294,7 +1299,8 @@ void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, ...@@ -1294,7 +1299,8 @@ void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
} }
} }
void vpx_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
__m128i dc_value; __m128i dc_value;
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
int a, i; int a, i;
...@@ -2152,7 +2158,7 @@ void iadst16_sse2(__m128i *in0, __m128i *in1) { ...@@ -2152,7 +2158,7 @@ void iadst16_sse2(__m128i *in0, __m128i *in1) {
iadst16_8col(in1); iadst16_8col(in1);
} }
void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) { int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 5); const __m128i final_rounding = _mm_set1_epi16(1 << 5);
...@@ -2184,10 +2190,10 @@ void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, ...@@ -2184,10 +2190,10 @@ void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
int i; int i;
// First 1-D inverse DCT // First 1-D inverse DCT
// Load input data. // Load input data.
in[0] = _mm_load_si128((const __m128i *)input); in[0] = load_input_data(input);
in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); in[1] = load_input_data(input + 8 * 2);
in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); in[2] = load_input_data(input + 8 * 4);
in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); in[3] = load_input_data(input + 8 * 6);
TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
...@@ -2391,7 +2397,7 @@ void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, ...@@ -2391,7 +2397,7 @@ void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
#define LOAD_DQCOEFF(reg, input) \ #define LOAD_DQCOEFF(reg, input) \
{ \ { \
reg = _mm_load_si128((const __m128i *) input); \ reg = load_input_data(input); \
input += 8; \ input += 8; \
} \ } \
...@@ -3029,7 +3035,7 @@ void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, ...@@ -3029,7 +3035,7 @@ void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
} }
// Only upper-left 8x8 has non-zero coeff // Only upper-left 8x8 has non-zero coeff
void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) { int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5); const __m128i final_rounding = _mm_set1_epi16(1<<5);
...@@ -3081,14 +3087,14 @@ void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, ...@@ -3081,14 +3087,14 @@ void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
int i; int i;
// Load input data. Only need to load the top left 8x8 block. // Load input data. Only need to load the top left 8x8 block.
in[0] = _mm_load_si128((const __m128i *)input); in[0] = load_input_data(input);
in[1] = _mm_load_si128((const __m128i *)(input + 32)); in[1] = load_input_data(input + 32);
in[2] = _mm_load_si128((const __m128i *)(input + 64)); in[2] = load_input_data(input + 64);
in[3] = _mm_load_si128((const __m128i *)(input + 96)); in[3] = load_input_data(input + 96);
in[4] = _mm_load_si128((const __m128i *)(input + 128)); in[4] = load_input_data(input + 128);
in[5] = _mm_load_si128((const __m128i *)(input + 160)); in[5] = load_input_data(input + 160);
in[6] = _mm_load_si128((const __m128i *)(input + 192)); in[6] = load_input_data(input + 192);
in[7] = _mm_load_si128((const __m128i *)(input + 224)); in[7] = load_input_data(input + 224);
for (i = 8; i < 32; ++i) { for (i = 8; i < 32; ++i) {
in[i] = _mm_setzero_si128(); in[i] = _mm_setzero_si128();
...@@ -3188,7 +3194,7 @@ void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, ...@@ -3188,7 +3194,7 @@ void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
} }
} }
void vpx_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) { int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 5); const __m128i final_rounding = _mm_set1_epi16(1 << 5);
...@@ -3464,7 +3470,8 @@ void vpx_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, ...@@ -3464,7 +3470,8 @@ void vpx_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
} }
} }
void vpx_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
__m128i dc_value; __m128i dc_value;
const __m128i zero = _mm_setzero_si128(); const __m128i zero = _mm_setzero_si128();
int a, i; int a, i;
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "./vpx_config.h" #include "./vpx_config.h"
#include "vpx/vpx_integer.h" #include "vpx/vpx_integer.h"
#include "vpx_dsp/inv_txfm.h" #include "vpx_dsp/inv_txfm.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
// perform 8x8 transpose // perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
...@@ -89,24 +90,35 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { ...@@ -89,24 +90,35 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
res0[15] = tbuf[7]; res0[15] = tbuf[7];
} }
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { // Function to allow 8 bit optimisations to be used when profile 0 is used with
in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); // highbitdepth enabled
in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); static INLINE __m128i load_input_data(const tran_low_t *data) {
in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); #if CONFIG_VP9_HIGHBITDEPTH
in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); data[6], data[7]);
in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); #else
in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); return _mm_load_si128((const __m128i *)data);