From ec5d01cbe486ebc29f0c03423f9a253a1655df7f Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Mon, 20 Jan 2014 16:32:16 -0500
Subject: [PATCH] Using a table on ARM for unsigned division by small (<=256)
 integers.

Saves 0.6% for 64 kb/s and 1.8% for 128 kb/s when decoding on arm7tdmi.
---
 celt/entcode.c | 38 ++++++++++++++++++++++++++++++++++++++
 celt/entcode.h | 22 ++++++++++++++++++++++
 celt/entdec.c  |  2 +-
 celt/entenc.c  |  2 +-
 4 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/celt/entcode.c b/celt/entcode.c
index fa5d7c7c..edb7ef90 100644
--- a/celt/entcode.c
+++ b/celt/entcode.c
@@ -91,3 +91,41 @@ opus_uint32 ec_tell_frac(ec_ctx *_this){
   }
   return nbits-l;
 }
+
+#ifdef USE_SMALL_DIV_TABLE
+/* Entry i is the result of 2^32/(2*i+1), except for i=0, which is stored as 2^32-1 (0xFFFFFFFF). Only 128 values are listed, so element 128 of this 129-entry array is implicitly zero. */
+const opus_uint32 SMALL_DIV_TABLE[129] = {
+   0xFFFFFFFF, 0x55555555, 0x33333333, 0x24924924,
+   0x1C71C71C, 0x1745D174, 0x13B13B13, 0x11111111,
+   0x0F0F0F0F, 0x0D79435E, 0x0C30C30C, 0x0B21642C,
+   0x0A3D70A3, 0x097B425E, 0x08D3DCB0, 0x08421084,
+   0x07C1F07C, 0x07507507, 0x06EB3E45, 0x06906906,
+   0x063E7063, 0x05F417D0, 0x05B05B05, 0x0572620A,
+   0x05397829, 0x05050505, 0x04D4873E, 0x04A7904A,
+   0x047DC11F, 0x0456C797, 0x04325C53, 0x04104104,
+   0x03F03F03, 0x03D22635, 0x03B5CC0E, 0x039B0AD1,
+   0x0381C0E0, 0x0369D036, 0x03531DEC, 0x033D91D2,
+   0x0329161F, 0x03159721, 0x03030303, 0x02F14990,
+   0x02E05C0B, 0x02D02D02, 0x02C0B02C, 0x02B1DA46,
+   0x02A3A0FD, 0x0295FAD4, 0x0288DF0C, 0x027C4597,
+   0x02702702, 0x02647C69, 0x02593F69, 0x024E6A17,
+   0x0243F6F0, 0x0239E0D5, 0x02302302, 0x0226B902,
+   0x021D9EAD, 0x0214D021, 0x020C49BA, 0x02040810,
+   0x01FC07F0, 0x01F44659, 0x01ECC07B, 0x01E573AC,
+   0x01DE5D6E, 0x01D77B65, 0x01D0CB58, 0x01CA4B30,
+   0x01C3F8F0, 0x01BDD2B8, 0x01B7D6C3, 0x01B20364,
+   0x01AC5701, 0x01A6D01A, 0x01A16D3F, 0x019C2D14,
+   0x01970E4F, 0x01920FB4, 0x018D3018, 0x01886E5F,
+   0x0183C977, 0x017F405F, 0x017AD220, 0x01767DCE,
+   0x01724287, 0x016E1F76, 0x016A13CD, 0x01661EC6,
+   0x01623FA7, 0x015E75BB, 0x015AC056, 0x01571ED3,
+   0x01539094, 0x01501501, 0x014CAB88, 0x0149539E,
+   0x01460CBC, 0x0142D662, 0x013FB013, 0x013C995A,
+   0x013991C2, 0x013698DF, 0x0133AE45, 0x0130D190,
+   0x012E025C, 0x012B404A, 0x01288B01, 0x0125E227,
+   0x01234567, 0x0120B470, 0x011E2EF3, 0x011BB4A4,
+   0x01194538, 0x0116E068, 0x011485F0, 0x0112358E,
+   0x010FEF01, 0x010DB20A, 0x010B7E6E, 0x010953F3,
+   0x01073260, 0x0105197F, 0x0103091B, 0x01010101
+};
+#endif
diff --git a/celt/entcode.h b/celt/entcode.h
index dd13e49e..c10fe8d7 100644
--- a/celt/entcode.h
+++ b/celt/entcode.h
@@ -34,6 +34,12 @@
 # include <stddef.h>
 # include "ecintrin.h"
 
+extern const opus_uint32 SMALL_DIV_TABLE[129];
+
+#ifdef OPUS_ARM_ASM
+#define USE_SMALL_DIV_TABLE
+#endif
+
 /*OPT: ec_window must be at least 32 bits, but if you have fast arithmetic on a
    larger type, you can speed up the decoder by using it here.*/
 typedef opus_uint32           ec_window;
@@ -114,4 +120,20 @@ static OPUS_INLINE int ec_tell(ec_ctx *_this){
            rounding error is in the positive direction).*/
 opus_uint32 ec_tell_frac(ec_ctx *_this);
 
+/* Returns n/d. With USE_SMALL_DIV_TABLE, d<=256 is handled by a multiply with SMALL_DIV_TABLE instead of '/'. Tested exhaustively for all n and for 1<=d<=256 */
+static OPUS_INLINE opus_uint32 celt_udiv(opus_uint32 n, opus_uint32 d) {
+#ifdef USE_SMALL_DIV_TABLE
+   if (d>256) /* divisor outside the table-driven range: plain division */
+      return n/d;
+   else {
+      opus_uint32 t, q;
+      t = EC_ILOG(d&-d); /* d&-d isolates the lowest set bit of d; assumes EC_ILOG(x) is 1+floor(log2(x)) -- TODO confirm */
+      q = (opus_uint64)SMALL_DIV_TABLE[d>>t]*(n>>(t-1))>>32; /* high 32 bits of the 64-bit product: quotient estimate */
+      return q+(n-q*d >= d); /* add 1 when the remainder n-q*d still contains a full d */
+   }
+#else
+   return n/d;
+#endif
+}
+
 #endif
diff --git a/celt/entdec.c b/celt/entdec.c
index 3c264685..0b3433ed 100644
--- a/celt/entdec.c
+++ b/celt/entdec.c
@@ -138,7 +138,7 @@ void ec_dec_init(ec_dec *_this,unsigned char *_buf,opus_uint32 _storage){
 
 unsigned ec_decode(ec_dec *_this,unsigned _ft){
   unsigned s;
-  _this->ext=_this->rng/_ft;
+  _this->ext=celt_udiv(_this->rng,_ft);
   s=(unsigned)(_this->val/_this->ext);
   return _ft-EC_MINI(s+1,_ft);
 }
diff --git a/celt/entenc.c b/celt/entenc.c
index a7e34ece..271e4d30 100644
--- a/celt/entenc.c
+++ b/celt/entenc.c
@@ -127,7 +127,7 @@ void ec_enc_init(ec_enc *_this,unsigned char *_buf,opus_uint32 _size){
 
 void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft){
   opus_uint32 r;
-  r=_this->rng/_ft;
+  r=celt_udiv(_this->rng,_ft);
   if(_fl>0){
     _this->val+=_this->rng-IMUL32(r,(_ft-_fl));
     _this->rng=IMUL32(r,(_fh-_fl));
-- 
GitLab