Implement sum_sqr_shift() using two passes with no branch inside the loops

Slightly slower on x86, about the same speed on ARMv7, should be faster on DSPs.

Implement sum_sqr_shift() using two passes with no branch inside the loops
7c645606 · Jean-Marc Valin · fe4d91c2 · 7c645606
Commit 7c645606 authored 9 years ago by Jean-Marc Valin
--- a/silk/sum_sqr_shift.c
+++ b/silk/sum_sqr_shift.c
@@ -41,43 +41,40 @@ void silk_sum_sqr_shift(
 )
 {
    opus_int   i, shft;
-    opus_int32 nrg_tmp, nrg;
+    opus_uint32 nrg_tmp;
+    opus_int32 nrg;

-    nrg  = 0;
-    shft = 0;
-    len--;
-    for( i = 0; i < len; i += 2 ) {
-        nrg = silk_SMLABB_ovflw( nrg, x[ i ], x[ i ] );
-        nrg = silk_SMLABB_ovflw( nrg, x[ i + 1 ], x[ i + 1 ] );
-        if( nrg < 0 ) {
-            /* Scale down */
-            nrg = (opus_int32)silk_RSHIFT_uint( (opus_uint32)nrg, 2 );
-            shft = 2;
-            i+=2;
-            break;
-        }
+    /* Do a first run with the maximum shift we could have. */
+    shft = 31-silk_CLZ32(len);
+    /* Let's be conservative with rounding and start with nrg=len. */
+    nrg  = len;
+    for( i = 0; i < len - 1; i += 2 ) {
+        nrg_tmp = silk_SMULBB( x[ i ], x[ i ] );
+        nrg_tmp = silk_SMLABB_ovflw( nrg_tmp, x[ i + 1 ], x[ i + 1 ] );
+        nrg = (opus_int32)silk_ADD_RSHIFT_uint( nrg, nrg_tmp, shft );
    }
-    for( ; i < len; i += 2 ) {
+    if( i < len ) {
+        /* One sample left to process */
+        nrg_tmp = silk_SMULBB( x[ i ], x[ i ] );
+        nrg = (opus_int32)silk_ADD_RSHIFT_uint( nrg, nrg_tmp, shft );
+    }
+    silk_assert( nrg >= 0 );
+    /* Make sure the result will fit in a 32-bit signed integer with two bits
+       of headroom. */
+    shft = silk_max_32(0, shft+3 - silk_CLZ32(nrg));
+    nrg = 0;
+    for( i = 0 ; i < len - 1; i += 2 ) {
        nrg_tmp = silk_SMULBB( x[ i ], x[ i ] );
        nrg_tmp = silk_SMLABB_ovflw( nrg_tmp, x[ i + 1 ], x[ i + 1 ] );
-        nrg = (opus_int32)silk_ADD_RSHIFT_uint( nrg, (opus_uint32)nrg_tmp, shft );
-        if( nrg < 0 ) {
-            /* Scale down */
-            nrg = (opus_int32)silk_RSHIFT_uint( (opus_uint32)nrg, 2 );
-            shft += 2;
-        }
+        nrg = (opus_int32)silk_ADD_RSHIFT_uint( nrg, nrg_tmp, shft );
    }
-    if( i == len ) {
+    if( i < len ) {
        /* One sample left to process */
        nrg_tmp = silk_SMULBB( x[ i ], x[ i ] );
        nrg = (opus_int32)silk_ADD_RSHIFT_uint( nrg, nrg_tmp, shft );
    }

-    /* Make sure to have at least one extra leading zero (two leading zeros in total) */
-    if( nrg & 0xC0000000 ) {
-        nrg = silk_RSHIFT_uint( (opus_uint32)nrg, 2 );
-        shft += 2;
-    }
+    silk_assert( nrg >= 0 );

    /* Output arguments */
    *shift  = shft;