Commit 698af562, authored by Sebastien Alaiwan and committed by Fred BARBIER

Explicit requirement about sizeof(tran_low_t)

Here, we're testing CONFIG_HIGHBITDEPTH but what we really depend upon
is the actual size of the coefficients.

Change-Id: I33d71e4b38b4b83bb4232346f4d449f20bcf740e
parent 2b6456c1
......@@ -15,21 +15,21 @@
#include "./aom_config.h"
// Writes the sixteen signed 16-bit lanes of *coeff to out using unaligned
// stores.
//
// The layout depends on the actual width of tran_low_t rather than on a build
// flag: when tran_low_t is 4 bytes, every lane is sign-extended to 32 bits and
// written as out[0..15]; otherwise the register is stored unchanged.
static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
  if (sizeof(tran_low_t) == 4) {
    const __m256i zero = _mm256_setzero_si256();
    // All-ones in each lane holding a negative value: these are the upper
    // 16 bits of the sign-extended 32-bit result.
    const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);

    // Interleave value/sign pairs. The unpack intrinsics operate within each
    // 128-bit half, so x0 and x1 hold the elements in a split order.
    __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
    __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);

    // Regroup the 128-bit halves so y0 carries elements 0..7 and y1 carries
    // elements 8..15.
    __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
    __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);

    _mm256_storeu_si256((__m256i *)out, y0);
    _mm256_storeu_si256((__m256i *)(out + 8), y1);
  } else {
    _mm256_storeu_si256((__m256i *)out, *coeff);
  }
}
#endif // AOM_DSP_X86_FWD_TXFM_AVX2_H
......@@ -247,16 +247,16 @@ static INLINE int k_check_epi32_overflow_32(
}
// Writes the eight signed 16-bit lanes of *poutput to dst_ptr with
// _mm_store_si128 (the aligned store intrinsic).
//
// When tran_low_t is 4 bytes, each lane is sign-extended to 32 bits and the
// result is written as dst_ptr[0..7]; otherwise the register is stored as-is.
static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
  if (sizeof(tran_low_t) == 4) {
    const __m128i zero = _mm_setzero_si128();
    // All-ones for negative lanes, zero otherwise: the high half of each
    // sign-extended value.
    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
    _mm_store_si128((__m128i *)(dst_ptr), out0);
    _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
  } else {
    _mm_store_si128((__m128i *)(dst_ptr), *poutput);
  }
}
static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
......
......@@ -18,17 +18,17 @@
#include "aom_dsp/x86/txfm_common_avx2.h"
// Loads sixteen coefficients from coeff into *in as 16-bit lanes.
//
// When tran_low_t is 4 bytes, each coefficient is converted with an explicit
// (int16_t) cast and the lanes are assembled in memory order; otherwise the
// sixteen values are fetched with a single unaligned 256-bit load.
static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
  if (sizeof(tran_low_t) == 4) {
    *in = _mm256_setr_epi16(
        (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
        (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
        (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
        (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
        (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
        (int16_t)coeff[15]);
  } else {
    *in = _mm256_loadu_si256((const __m256i *)coeff);
  }
}
static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
......
......@@ -133,12 +133,12 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
// Function to allow 8 bit optimisations to be used when profile 0 is used with
// highbitdepth enabled
static INLINE __m128i load_input_data(const tran_low_t *data) {
  // Select the load path from the real coefficient width, not a build flag.
  if (sizeof(tran_low_t) == 4) {
    // octa_set_epi16 is defined elsewhere; presumably it packs the eight
    // values into 16-bit lanes in the order given -- confirm at its
    // definition.
    return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
                          data[6], data[7]);
  } else {
    // Coefficients are read with one aligned 128-bit load.
    return _mm_load_si128((const __m128i *)data);
  }
}
static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
......
......@@ -16,29 +16,29 @@
#include "aom/aom_integer.h"
// Returns eight coefficients starting at coeff_ptr as 16-bit lanes.
//
// When tran_low_t is 4 bytes, each coefficient is converted with an explicit
// (int16_t) cast and placed in memory order; otherwise the eight values are
// read with one aligned 128-bit load.
static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
  if (sizeof(tran_low_t) == 4) {
    return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
                          (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
                          (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
                          (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
  } else {
    return _mm_load_si128((const __m128i *)coeff_ptr);
  }
}
// Writes the eight signed 16-bit lanes of coeff_vals to coeff_ptr using
// aligned 128-bit stores.
//
// When tran_low_t is 4 bytes, each lane is widened to 32 bits and written as
// coeff_ptr[0..7]; otherwise the register is stored unchanged.
static INLINE void store_coefficients(__m128i coeff_vals,
                                      tran_low_t *coeff_ptr) {
  if (sizeof(tran_low_t) == 4) {
    // Signed multiply by 1: the low 16 bits of each product are the value
    // itself and the high 16 bits are its sign extension.
    __m128i one = _mm_set1_epi16(1);
    __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
    __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
    // Interleave low/high halves to form the 32-bit elements.
    __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
    __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
    _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
    _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
  } else {
    _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
  }
}
void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
......
......@@ -16,16 +16,16 @@
// This header file should be put below any x86 intrinsics head file
// Unaligned counterpart of an eight-lane coefficient store: writes the signed
// 16-bit lanes of *poutput to dst_ptr with _mm_storeu_si128.
//
// When tran_low_t is 4 bytes, each lane is sign-extended to 32 bits and the
// result is written as dst_ptr[0..7]; otherwise the register is stored as-is.
static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
  if (sizeof(tran_low_t) == 4) {
    const __m128i zero = _mm_setzero_si128();
    // All-ones for negative lanes, zero otherwise: the high half of each
    // sign-extended value.
    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
    _mm_storeu_si128((__m128i *)(dst_ptr), out0);
    _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
  } else {
    _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
  }
}
#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
......@@ -16,24 +16,24 @@
#include "aom_dsp/aom_dsp_common.h"
// Reads sixteen coefficients from coeff into *c as 16-bit lanes, using
// unaligned loads.
//
// When tran_low_t is 4 bytes, two 256-bit loads fetch the sixteen 32-bit
// values, which are then narrowed to 16 bits with signed saturation;
// otherwise a single 256-bit load is enough.
static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) {
  if (sizeof(tran_low_t) == 4) {
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
    *c = _mm256_packs_epi32(x0, x1);
    // The pack works within each 128-bit half, leaving the 64-bit quarters
    // in order 0,2,1,3; 0xD8 swaps the middle two to restore memory order.
    *c = _mm256_permute4x64_epi64(*c, 0xD8);
  } else {
    *c = _mm256_loadu_si256((const __m256i *)coeff);
  }
}
// Zeroes sixteen coefficients starting at qcoeff using unaligned 256-bit
// stores: 32 bytes always, plus a second 32 bytes when tran_low_t is 4 bytes
// wide.
static INLINE void write_zero(tran_low_t *qcoeff) {
  const __m256i zero = _mm256_setzero_si256();
  _mm256_storeu_si256((__m256i *)qcoeff, zero);
  if (sizeof(tran_low_t) == 4) {
    // Sixteen 4-byte coefficients span two 256-bit registers.
    _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
  }
}
static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
......@@ -83,19 +83,16 @@ static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
_mm256_storeu_si256((__m256i *)addr + 1, x1); \
} while (0)
// Stores register q at addr1 and register dq at addr2.
//
// When tran_low_t is 4 bytes, both stores go through store_quan; otherwise
// each register is written directly with one unaligned 256-bit store. The
// choice is made from sizeof(tran_low_t), so a single definition serves both
// configurations. Address arguments are parenthesised in the direct-store
// casts so that pointer expressions are cast as a whole.
#define store_two_quan(q, addr1, dq, addr2)        \
  do {                                             \
    if (sizeof(tran_low_t) == 4) {                 \
      store_quan(q, addr1);                        \
      store_quan(dq, addr2);                       \
    } else {                                       \
      _mm256_storeu_si256((__m256i *)(addr1), q);  \
      _mm256_storeu_si256((__m256i *)(addr2), dq); \
    }                                              \
  } while (0)
static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
const int16_t *iscan_ptr, tran_low_t *qcoeff,
......
......@@ -18,53 +18,53 @@
// Reads sixteen coefficients starting at coeff[offset] into *c0 (first
// eight) and *c1 (last eight) as 16-bit lanes, using aligned 128-bit loads.
//
// When tran_low_t is 4 bytes, four loads fetch the 32-bit values, which are
// narrowed to 16 bits with signed saturation; otherwise two loads suffice.
static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
                              __m128i *c0, __m128i *c1) {
  const tran_low_t *addr = coeff + offset;
  if (sizeof(tran_low_t) == 4) {
    const __m128i x0 = _mm_load_si128((const __m128i *)addr);
    const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
    const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
    const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
    *c0 = _mm_packs_epi32(x0, x1);
    *c1 = _mm_packs_epi32(x2, x3);
  } else {
    *c0 = _mm_load_si128((const __m128i *)addr);
    *c1 = _mm_load_si128((const __m128i *)addr + 1);
  }
}
// Writes the sixteen signed 16-bit lanes held in *qc0 and *qc1 to
// qcoeff[offset..offset+15] using aligned 128-bit stores.
//
// When tran_low_t is 4 bytes, each lane is sign-extended to 32 bits, which
// takes four stores; otherwise the two registers are stored unchanged.
static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
                                tran_low_t *qcoeff, intptr_t offset) {
  tran_low_t *addr = qcoeff + offset;
  if (sizeof(tran_low_t) == 4) {
    const __m128i zero = _mm_setzero_si128();

    // First register: the sign mask supplies the upper 16 bits of each
    // widened element.
    __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
    __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
    __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
    _mm_store_si128((__m128i *)addr, y0);
    _mm_store_si128((__m128i *)addr + 1, y1);

    // Second register, same widening, written right after the first.
    sign_bits = _mm_cmplt_epi16(*qc1, zero);
    y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
    y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
    _mm_store_si128((__m128i *)addr + 2, y0);
    _mm_store_si128((__m128i *)addr + 3, y1);
  } else {
    _mm_store_si128((__m128i *)addr, *qc0);
    _mm_store_si128((__m128i *)addr + 1, *qc1);
  }
}
// Zeroes sixteen coefficients starting at qcoeff[offset] using aligned
// 128-bit stores: two stores always, plus two more when tran_low_t is 4
// bytes wide.
static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
  const __m128i zero = _mm_setzero_si128();
  tran_low_t *addr = qcoeff + offset;
  _mm_store_si128((__m128i *)addr, zero);
  _mm_store_si128((__m128i *)addr + 1, zero);
  if (sizeof(tran_low_t) == 4) {
    // Sixteen 4-byte coefficients span four 128-bit registers.
    _mm_store_si128((__m128i *)addr + 2, zero);
    _mm_store_si128((__m128i *)addr + 3, zero);
  }
}
void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
......
......@@ -17,14 +17,15 @@
// Reads sixteen coefficients starting at coeff[offset] into *c as 16-bit
// lanes, using unaligned loads.
//
// When tran_low_t is 4 bytes, two 256-bit loads fetch the sixteen 32-bit
// values, which are narrowed to 16 bits with signed saturation; otherwise a
// single 256-bit load is enough.
static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
                              __m256i *c) {
  const tran_low_t *addr = coeff + offset;
  if (sizeof(tran_low_t) == 4) {
    const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
    const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
    const __m256i y = _mm256_packs_epi32(x0, x1);
    // The pack works within each 128-bit half, leaving the 64-bit quarters
    // in order 0,2,1,3; 0xD8 swaps the middle two to restore memory order.
    *c = _mm256_permute4x64_epi64(y, 0xD8);
  } else {
    *c = _mm256_loadu_si256((const __m256i *)addr);
  }
}
int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment