Commit d3e22456 by Imdad Sardharwalla Committed by Debargha Mukherjee

### Optimise SSE4.1 self-guided filter implementation

```
The cross_sum function has been replaced by a more optimised
version, increasing the speed of the filter by ~5%.

Change-Id: Ieb0fbe53033591919f719d0a288a55abd74ba2e4
```
parent 7d2e5c96
... ...
```diff
@@ -231,22 +231,34 @@ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
 // Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter
 // where the outer four corners have weight 3 and all other pixels have weight
 // 4.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// xl x xr
+// xbl xb xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+//           = 4 * (fours + threes) - threes
+//           = (fours + threes) << 2 - threes
 static __m128i cross_sum(const int32_t *buf, int stride) {
-  const __m128i a0 = xx_loadu_128(buf - 1 - stride);
-  const __m128i a1 = xx_loadu_128(buf + 3 - stride);
-  const __m128i b0 = xx_loadu_128(buf - 1);
-  const __m128i b1 = xx_loadu_128(buf + 3);
-  const __m128i c0 = xx_loadu_128(buf - 1 + stride);
-  const __m128i c1 = xx_loadu_128(buf + 3 + stride);
-
-  const __m128i fours =
-      _mm_add_epi32(_mm_add_epi32(_mm_add_epi32(_mm_alignr_epi8(b1, b0, 4), b0),
-                                  _mm_add_epi32(_mm_alignr_epi8(b1, b0, 8),
-                                                _mm_alignr_epi8(c1, c0, 4))),
-                    _mm_alignr_epi8(a1, a0, 4));
-  const __m128i threes = _mm_add_epi32(
-      _mm_add_epi32(a0, c0),
-      _mm_add_epi32(_mm_alignr_epi8(a1, a0, 8), _mm_alignr_epi8(c1, c0, 8)));
+  const __m128i xtl = xx_loadu_128(buf - 1 - stride);
+  const __m128i xt = xx_loadu_128(buf - stride);
+  const __m128i xtr = xx_loadu_128(buf + 1 - stride);
+  const __m128i xl = xx_loadu_128(buf - 1);
+  const __m128i x = xx_loadu_128(buf);
+  const __m128i xr = xx_loadu_128(buf + 1);
+  const __m128i xbl = xx_loadu_128(buf - 1 + stride);
+  const __m128i xb = xx_loadu_128(buf + stride);
+  const __m128i xbr = xx_loadu_128(buf + 1 + stride);
+
+  const __m128i fours = _mm_add_epi32(
+      xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x))));
+  const __m128i threes =
+      _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
 
   return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes);
 }
```
... ...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment