Commit 46443741, authored by David Barker, committed by Debargha Mukherjee
Browse files

Remove temporary condition from warp code

Patch https://aomedia-review.googlesource.com/c/10901/ temporarily
disabled the SSE2 warp filter for 4x4 blocks, because of a
data race when the filter was used at the right-hand edge of a
tile in a multithreaded encode.

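This patch fixes the data race and re-enables the SSE2 warp filter for
4x4 blocks. When blending with 'pred', the SSE2 and SSSE3 filters now load
only as many pixels as they subsequently store.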

Change-Id: I7058c897ddf538cd10001c5be13b1a1bfe8320fd
parent 93760cdc
...@@ -1342,14 +1342,9 @@ static void warp_plane(WarpedMotionParams *wm, uint8_t *ref, int width, ...@@ -1342,14 +1342,9 @@ static void warp_plane(WarpedMotionParams *wm, uint8_t *ref, int width,
const int16_t gamma = wm->gamma; const int16_t gamma = wm->gamma;
const int16_t delta = wm->delta; const int16_t delta = wm->delta;
if (p_width == 4) av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
av1_warp_affine_c(mat, ref, width, height, stride, pred, p_col, p_row, p_width, p_height, p_stride, subsampling_x, subsampling_y,
p_width, p_height, p_stride, subsampling_x, ref_frm, alpha, beta, gamma, delta);
subsampling_y, ref_frm, alpha, beta, gamma, delta);
else
av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
p_width, p_height, p_stride, subsampling_x, subsampling_y,
ref_frm, alpha, beta, gamma, delta);
} else { } else {
warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width, warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
p_height, p_stride, subsampling_x, subsampling_y, x_scale, p_height, p_stride, subsampling_x, subsampling_y, x_scale,
......
...@@ -270,15 +270,16 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width, ...@@ -270,15 +270,16 @@ void av1_highbd_warp_affine_ssse3(int32_t *mat, uint16_t *ref, int width,
// Store, blending with 'pred' if needed // Store, blending with 'pred' if needed
__m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p));
// Note: If we're outputting a 4x4 block, we need to be very careful // Note: If we're outputting a 4x4 block, we need to be very careful
// to only output 4 pixels at this point, to avoid encode/decode // to only output 4 pixels at this point, to avoid encode/decode
// mismatches when encoding with multiple threads. // mismatches when encoding with multiple threads.
if (p_width == 4) if (p_width == 4) {
if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadl_epi64(p));
_mm_storel_epi64(p, res_16bit); _mm_storel_epi64(p, res_16bit);
else } else {
if (ref_frm) res_16bit = _mm_avg_epu16(res_16bit, _mm_loadu_si128(p));
_mm_storeu_si128(p, res_16bit); _mm_storeu_si128(p, res_16bit);
}
} }
} }
} }
......
...@@ -278,15 +278,19 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height, ...@@ -278,15 +278,19 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
// Store, blending with 'pred' if needed // Store, blending with 'pred' if needed
__m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
if (ref_frm) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
// Note: If we're outputting a 4x4 block, we need to be very careful // Note: If we're outputting a 4x4 block, we need to be very careful
// to only output 4 pixels at this point, to avoid encode/decode // to only output 4 pixels at this point, to avoid encode/decode
// mismatches when encoding with multiple threads. // mismatches when encoding with multiple threads.
if (p_width == 4) if (p_width == 4) {
if (ref_frm) {
const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
res_8bit = _mm_avg_epu8(res_8bit, orig);
}
*(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
else } else {
if (ref_frm) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
_mm_storel_epi64(p, res_8bit); _mm_storel_epi64(p, res_8bit);
}
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment