Commit 08184e32 authored by Yi Luo's avatar Yi Luo

Change register loading to fix a stack buffer over-read issue

- Use _mm_loadl_epi64 instead of _mm_loadu_si128 for
  uint16_t temp2[4 * 4] buffer.
- Refer to:
  d0de89a1 remove vpx_highbd_1[02]_sub_pixel_variance4x4_sse4_1
BUG=webm:1242

Change-Id: Ieff555c8dd8070937f27f4ec8535b77e1ed5b8b2
parent 76ff9b30
......@@ -1151,7 +1151,9 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
SSE4_1, VpxSubpelVarianceTest,
::testing::Values(
make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_sse4_1, 8)));
make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_sse4_1, 8),
make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_sse4_1, 10),
make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_sse4_1, 12)));
INSTANTIATE_TEST_CASE_P(
SSE4_1, VpxSubpelAvgVarianceTest,
......
......@@ -1359,8 +1359,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
}
}
}
$vpx_highbd_10_sub_pixel_variance4x4_sse4_1='';
$vpx_highbd_12_sub_pixel_variance4x4_sse4_1='';
} # CONFIG_VP9_HIGHBITDEPTH
if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
......
......@@ -29,15 +29,15 @@ static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
a0 = _mm_loadu_si128((__m128i const *) (a + 0 * a_stride));
a1 = _mm_loadu_si128((__m128i const *) (a + 1 * a_stride));
a2 = _mm_loadu_si128((__m128i const *) (a + 2 * a_stride));
a3 = _mm_loadu_si128((__m128i const *) (a + 3 * a_stride));
a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
b0 = _mm_loadu_si128((__m128i const *) (b + 0 * b_stride));
b1 = _mm_loadu_si128((__m128i const *) (b + 1 * b_stride));
b2 = _mm_loadu_si128((__m128i const *) (b + 2 * b_stride));
b3 = _mm_loadu_si128((__m128i const *) (b + 3 * b_stride));
b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
u0 = _mm_unpacklo_epi16(a0, a1);
u1 = _mm_unpacklo_epi16(a2, a3);
......@@ -130,6 +130,44 @@ uint32_t vpx_highbd_8_sub_pixel_variance4x4_sse4_1(
4, dst, dst_stride, sse);
}
// 10-bit sub-pixel variance of a 4x4 block (SSE4.1 path).
// Applies a separable 2-tap bilinear filter at (xoffset, yoffset) to the
// high-bitdepth source in two passes, then returns the variance of the
// filtered block against dst. *sse receives the sum of squared errors.
uint32_t vpx_highbd_10_sub_pixel_variance4x4_sse4_1(
    const uint8_t *src, int src_stride,
    int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride,
    uint32_t *sse) {
  // Horizontal pass emits one extra row (4 + 1) so the vertical pass has
  // the neighbor row it needs.
  uint16_t intermediate[(4 + 1) * 4];
  // Final 4x4 filtered block. NOTE: rows are only 8 bytes wide, so SIMD
  // readers of this buffer must use 64-bit loads, not 128-bit ones.
  uint16_t filtered[4 * 4];

  vpx_highbd_var_filter_block2d_bil_first_pass(
      src, intermediate, src_stride, 1, 4 + 1, 4,
      bilinear_filters_2t[xoffset]);
  vpx_highbd_var_filter_block2d_bil_second_pass(
      intermediate, filtered, 4, 4, 4, 4, bilinear_filters_2t[yoffset]);

  return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(filtered), 4, dst,
                                   dst_stride, sse);
}
// 12-bit sub-pixel variance of a 4x4 block (SSE4.1 path).
// Applies a separable 2-tap bilinear filter at (xoffset, yoffset) to the
// high-bitdepth source in two passes, then returns the variance of the
// filtered block against dst. *sse receives the sum of squared errors.
uint32_t vpx_highbd_12_sub_pixel_variance4x4_sse4_1(
    const uint8_t *src, int src_stride,
    int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride,
    uint32_t *sse) {
  // Horizontal pass emits one extra row (4 + 1) so the vertical pass has
  // the neighbor row it needs.
  uint16_t intermediate[(4 + 1) * 4];
  // Final 4x4 filtered block. NOTE: rows are only 8 bytes wide, so SIMD
  // readers of this buffer must use 64-bit loads, not 128-bit ones.
  uint16_t filtered[4 * 4];

  vpx_highbd_var_filter_block2d_bil_first_pass(
      src, intermediate, src_stride, 1, 4 + 1, 4,
      bilinear_filters_2t[xoffset]);
  vpx_highbd_var_filter_block2d_bil_second_pass(
      intermediate, filtered, 4, 4, 4, 4, bilinear_filters_2t[yoffset]);

  return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(filtered), 4, dst,
                                   dst_stride, sse);
}
// Sub-pixel average
uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment