Commit 2ba201fc authored by James Zern, committed by Gerrit Code Review

Merge "32 Align Load bug In the sub_pixel_avg_variance the parameter sec was...

Merge "32 Align Load bug In the sub_pixel_avg_variance the parameter sec was also aligned load and changed to unaligned."
parents 480693b9 efdfdf57
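
The change below replaces aligned 256-bit loads of the second-prediction buffer sec with unaligned loads. A minimal sketch of why (not part of the patch; the helper name is hypothetical): sec is a plain uint8_t pointer with no 32-byte alignment guarantee from the caller, so _mm256_load_si256 (VMOVDQA, which requires 32-byte alignment) can fault, while _mm256_loadu_si256 (VMOVDQU) accepts any address and is typically just as fast on AVX2-capable CPUs when the data happens to be aligned.

#include <immintrin.h>
#include <stdint.h>

/* Hypothetical helper illustrating the pattern used throughout the patch:
 * the second-prediction row is read with an unaligned 256-bit load because
 * its address may not be a multiple of 32. */
__m256i load_sec_row(const uint8_t *sec) {
  return _mm256_loadu_si256((__m256i const *) sec);  /* valid for any address */
}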
@@ -333,7 +333,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
   if (y_offset == 0) {
     for (i = 0; i < height ; i++) {
       LOAD_SRC_DST
-      sec_reg = _mm256_load_si256((__m256i const *) (sec));
+      sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
       src_reg = _mm256_avg_epu8(src_reg, sec_reg);
       sec+= sec_stride;
       // expend each byte to 2 bytes
@@ -347,7 +347,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
     for (i = 0; i < height ; i++) {
       LOAD_SRC_DST
       AVG_NEXT_SRC(src_reg, src_stride)
-      sec_reg = _mm256_load_si256((__m256i const *) (sec));
+      sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
       src_reg = _mm256_avg_epu8(src_reg, sec_reg);
       sec+= sec_stride;
       // expend each byte to 2 bytes
@@ -369,7 +369,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
       MERGE_NEXT_SRC(src_reg, src_stride)
       FILTER_SRC(filter)
       src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      sec_reg = _mm256_load_si256((__m256i const *) (sec));
+      sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
       src_reg = _mm256_avg_epu8(src_reg, sec_reg);
       sec+= sec_stride;
       MERGE_WITH_SRC(src_reg, zero_reg)
@@ -385,7 +385,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
     for (i = 0; i < height ; i++) {
       LOAD_SRC_DST
       AVG_NEXT_SRC(src_reg, 1)
-      sec_reg = _mm256_load_si256((__m256i const *) (sec));
+      sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
       src_reg = _mm256_avg_epu8(src_reg, sec_reg);
       sec+= sec_stride;
       // expand each byte to 2 bytes
@@ -409,7 +409,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
       AVG_NEXT_SRC(src_reg, 1)
       // average between previous average to current average
       src_avg = _mm256_avg_epu8(src_avg, src_reg);
-      sec_reg = _mm256_load_si256((__m256i const *) (sec));
+      sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
       src_avg = _mm256_avg_epu8(src_avg, sec_reg);
       sec+= sec_stride;
       // expand each byte to 2 bytes
@@ -437,7 +437,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
       MERGE_WITH_SRC(src_avg, src_reg)
       FILTER_SRC(filter)
       src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      sec_reg = _mm256_load_si256((__m256i const *) (sec));
+      sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
       src_avg = _mm256_avg_epu8(src_avg, sec_reg);
       // expand each byte to 2 bytes
       MERGE_WITH_SRC(src_avg, zero_reg)
@@ -459,7 +459,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
       MERGE_NEXT_SRC(src_reg, 1)
       FILTER_SRC(filter)
       src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      sec_reg = _mm256_load_si256((__m256i const *) (sec));
+      sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
       src_reg = _mm256_avg_epu8(src_reg, sec_reg);
       MERGE_WITH_SRC(src_reg, zero_reg)
       sec+= sec_stride;
@@ -487,7 +487,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
       src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
       // average between previous pack to the current
       src_pack = _mm256_avg_epu8(src_pack, src_reg);
-      sec_reg = _mm256_load_si256((__m256i const *) (sec));
+      sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
       src_pack = _mm256_avg_epu8(src_pack, sec_reg);
       sec+= sec_stride;
       MERGE_WITH_SRC(src_pack, zero_reg)
@@ -524,7 +524,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
       // filter the source
       FILTER_SRC(yfilter)
       src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      sec_reg = _mm256_load_si256((__m256i const *) (sec));
+      sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
       src_pack = _mm256_avg_epu8(src_pack, sec_reg);
       MERGE_WITH_SRC(src_pack, zero_reg)
       src_pack = src_reg;
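
For reference, a standalone sketch (hypothetical, not from the repository; build with AVX2 enabled, e.g. -mavx2) of the failure mode the patch avoids: an aligned 256-bit load from an address that is not a multiple of 32 is undefined and usually raises a general-protection fault, whereas the unaligned load used above is permitted for any address.

#include <immintrin.h>
#include <stdint.h>
#include <stdlib.h>

int main(void) {
  uint8_t *buf = calloc(64, 1);
  if (!buf) return 1;
  const uint8_t *sec = buf + 1;  /* deliberately not 32-byte aligned */

  /* _mm256_load_si256((__m256i const *) sec) would be undefined here and
   * typically crashes at runtime; the unaligned form is always valid. */
  __m256i v = _mm256_loadu_si256((__m256i const *) sec);
  (void) v;

  free(buf);
  return 0;
}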