Commit e9fde265 authored by Yi Luo
parent fbabcad6
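Every hunk in this commit splices a _mm256_zeroupper() call into the point where an AVX2 function (or function-generating macro) has finished its 256-bit work and is about to return to scalar or SSE code. On several Intel microarchitectures (Sandy Bridge through Broadwell), executing legacy SSE instructions while the upper halves of the YMM registers hold live data incurs a state-transition penalty; the VZEROUPPER instruction that this intrinsic emits zeroes those upper halves and ends the dirty state. Compilers that compile a whole function as AVX code usually insert the instruction at function exit themselves, but hand-placed calls like these make the behavior explicit regardless of toolchain. A minimal sketch of the pattern, assuming a hypothetical sum_u8x32() helper that is not part of this commit:

#include <immintrin.h>
#include <stdint.h>

/* Hypothetical helper illustrating the commit's pattern: do all the
 * 256-bit work, reduce to a scalar, then clear the upper YMM halves
 * with _mm256_zeroupper() before returning to non-AVX code. */
static uint32_t sum_u8x32(const uint8_t *p) {
  const __m256i v = _mm256_loadu_si256((const __m256i *)p);
  /* SAD against zero sums each group of 8 bytes into a 64-bit lane. */
  const __m256i sad = _mm256_sad_epu8(v, _mm256_setzero_si256());
  __m128i lo = _mm256_castsi256_si128(sad);
  const __m128i hi = _mm256_extracti128_si256(sad, 1);
  lo = _mm_add_epi64(lo, hi);                    /* fold 256 -> 128 bits */
  lo = _mm_add_epi64(lo, _mm_srli_si128(lo, 8)); /* fold 128 -> 64 bits */
  const uint32_t total = (uint32_t)_mm_cvtsi128_si32(lo);
  _mm256_zeroupper(); /* end the dirty upper state before returning */
  return total;
}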
@@ -2990,4 +2990,5 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
}
}
}
_mm256_zeroupper();
} // NOLINT
@@ -911,4 +911,5 @@ void aom_lpf_horizontal_edge_16_avx2(unsigned char *s, int p,
q6 = _mm_or_si128(flat2_q6, q6);
_mm_storeu_si128((__m128i *)(s + 6 * p), q6);
}
_mm256_zeroupper();
}
@@ -78,6 +78,7 @@ void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
_mm_storeu_si128((__m128i *)(res), sum);
}
_mm256_zeroupper();
}
void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
@@ -162,4 +163,5 @@ void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
_mm_storeu_si128((__m128i *)(res), sum);
}
_mm256_zeroupper();
}
@@ -37,6 +37,7 @@
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
res = _mm_cvtsi128_si32(sum_sad128); \
_mm256_zeroupper(); \
return res; \
}
@@ -69,6 +70,7 @@
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
res = _mm_cvtsi128_si32(sum_sad128); \
_mm256_zeroupper(); \
return res; \
}
@@ -122,6 +124,7 @@ FSAD32
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
res = _mm_cvtsi128_si32(sum_sad128); \
_mm256_zeroupper(); \
return res; \
}
@@ -160,6 +163,7 @@ FSAD32
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
res = _mm_cvtsi128_si32(sum_sad128); \
_mm256_zeroupper(); \
return res; \
}
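In the four SAD macro hunks above, the call rides on a backslash-continued line just before return res;, so every function these macros stamp out (the FSAD32 expansions visible in the hunk headers, among others) inherits the cleanup at its single exit point without a per-function edit.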
@@ -8,6 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <immintrin.h>
#include "./aom_dsp_rtcd.h"
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
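The new #include <immintrin.h> at the top of this file is presumably what makes _mm256_zeroupper() visible here: the later hunks in the same file call the intrinsic directly rather than only dispatching through the get_var_avx2-typed helpers.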
@@ -43,9 +45,13 @@ unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
unsigned int variance;
variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
aom_get16x16var_avx2, 16);
return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
_mm256_zeroupper();
return variance;
}
unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
@@ -53,6 +59,7 @@ unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
unsigned int *sse) {
int sum;
aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
_mm256_zeroupper();
return *sse;
}
@@ -60,36 +67,52 @@ unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
unsigned int variance;
variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
aom_get32x32var_avx2, 32);
return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
_mm256_zeroupper();
return variance;
}
unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
unsigned int variance;
variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
aom_get32x32var_avx2, 32);
return *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
_mm256_zeroupper();
return variance;
}
unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
unsigned int variance;
variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
aom_get32x32var_avx2, 32);
return *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
_mm256_zeroupper();
return variance;
}
unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
unsigned int variance;
variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
aom_get32x32var_avx2, 32);
return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
_mm256_zeroupper();
return variance;
}
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
@@ -115,8 +138,12 @@ unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src,
aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
dst + 32, dst_stride, 64, &sse2);
const int se = se1 + se2;
unsigned int variance;
*sse = sse1 + sse2;
return *sse - (uint32_t)(((int64_t)se * se) >> 12);
variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
_mm256_zeroupper();
return variance;
}
unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
@@ -126,7 +153,10 @@ unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
unsigned int *sse) {
const int se = aom_sub_pixel_variance32xh_avx2(
src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
return *sse - (uint32_t)(((int64_t)se * se) >> 10);
const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
_mm256_zeroupper();
return variance;
}
unsigned int aom_sub_pixel_avg_variance64x64_avx2(
@@ -140,10 +170,13 @@ unsigned int aom_sub_pixel_avg_variance64x64_avx2(
src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
64, 64, &sse2);
const int se = se1 + se2;
unsigned int variance;
*sse = sse1 + sse2;
return *sse - (uint32_t)(((int64_t)se * se) >> 12);
variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
_mm256_zeroupper();
return variance;
}
unsigned int aom_sub_pixel_avg_variance32x32_avx2(
@@ -152,5 +185,8 @@ unsigned int aom_sub_pixel_avg_variance32x32_avx2(
// Process 32 elements in parallel.
const int se = aom_sub_pixel_avg_variance32xh_avx2(
src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
return *sse - (uint32_t)(((int64_t)se * se) >> 10);
const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
_mm256_zeroupper();
return variance;
}
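The variance hunks above all share one mechanical rewrite: a bare return *sse - ...; is split into an assignment to a local variance, the _mm256_zeroupper() call, and return variance;. The cleanup has to run after the last arithmetic that reads *sse but before control leaves the function, and a return expression leaves no statement slot for it.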
@@ -139,6 +139,7 @@ void aom_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
*((int *)Sum) = _mm_cvtsi128_si32(sum_res);
}
_mm256_zeroupper();
}
void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
@@ -228,6 +229,7 @@ void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
*((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
}
_mm256_zeroupper();
}
#define FILTER_SRC(filter) \
@@ -482,6 +484,7 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
}
}
CALC_SUM_AND_SSE
_mm256_zeroupper();
return sum;
}
@@ -705,5 +708,6 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2(
}
}
CALC_SUM_AND_SSE
_mm256_zeroupper();
return sum;
}
@@ -68,5 +68,6 @@ int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
_mm_storel_epi64((__m128i *)(&sse), sse_reg128);
_mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
_mm256_zeroupper();
return sse;
}