diff --git a/libentcode/ecintrin.h b/libentcode/ecintrin.h
index 4c00596a6e633eb8eb9eb346483bfb7ce2b1d768..f9a960f9a2520da7de14942251cec29c6b39d96b 100644
--- a/libentcode/ecintrin.h
+++ b/libentcode/ecintrin.h
@@ -61,9 +61,10 @@
 #endif
 #if defined(EC_CLZ)
 /*Note that __builtin_clz is not defined when _x==0, according to the gcc
-   documentation (and that of the BSR instruction that implements it on x86),
-   so we have to special-case it.*/
-# define EC_ILOG(_x) (EC_CLZ0-EC_CLZ(_x)&-!!(_x))
+   documentation (and that of the BSR instruction that implements it on x86).
+  Most callers can never pass it zero, so this macro does not check for it.
+  A caller that might pass zero has to special-case it itself.*/
+# define EC_ILOG(_x) (EC_CLZ0-EC_CLZ(_x))
 #else
 # define EC_ILOG(_x) (ec_ilog(_x))
 #endif
@@ -81,9 +82,10 @@
 #endif
 #if defined(EC_CLZ64)
 /*Note that __builtin_clz is not defined when _x==0, according to the gcc
-   documentation (and that of the BSR instruction that implements it on x86),
-   so we have to special-case it.*/
-# define EC_ILOG64(_x) (EC_CLZ64_0-EC_CLZ64(_x)&-!!(_x))
+   documentation (and that of the BSR instruction that implements it on x86).
+  Most callers can never pass it zero, so this macro does not check for it.
+  A caller that might pass zero has to special-case it itself.*/
+# define EC_ILOG64(_x) (EC_CLZ64_0-EC_CLZ64(_x))
 #else
 # define EC_ILOG64(_x) (ec_ilog64(_x))
 #endif
diff --git a/libentcode/ectest.c b/libentcode/ectest.c
index c254722ddf019c41b10c53856cc911f735ae8588..3f64288c01e451f793ec72170afb5c3a869684c6 100644
--- a/libentcode/ectest.c
+++ b/libentcode/ectest.c
@@ -1,6 +1,8 @@
 #include <stdio.h>
+#include <math.h>
 #include "probenc.h"
 #include "probdec.h"
+#include "bitrenc.h"
 
 int main(int _argc,char **_argv){
   ec_byte_buffer buf;
@@ -8,32 +10,54 @@ int main(int _argc,char **_argv){
   ec_dec         dec;
   ec_probmod     mod;
   ec_uint64      sym64;
+  long           nbits;
+  double         entropy;
   int            ft;
   int            ftb;
   int            sym;
   int            sz;
   int            s;
   int            i;
+  entropy=0;
   /*Testing encoding of raw bit values.*/
   ec_byte_writeinit(&buf);
   ec_enc_init(&enc,&buf);
   for(ft=0;ft<1024;ft++){
     for(i=0;i<ft;i++){
+      entropy+=log(ft)*M_LOG2E;
       ec_enc_uint(&enc,i,ft);
+      entropy+=log(ft)*M_LOG2E+30;
       ec_enc_uint64(&enc,(ec_uint64)i<<30|i,(ec_uint64)ft<<30);
     }
   }
   /*Testing encoding of raw bit values.*/
   for(ftb=0;ftb<16;ftb++){
     for(i=0;i<(1<<ftb);i++){
+      long nbits;
+      long nbits2;
+      entropy+=ftb;
+      nbits=ec_enc_tell(&enc);
       ec_enc_bits(&enc,i,ftb);
+      nbits2=ec_enc_tell(&enc);
+      if(nbits2-nbits!=ftb){
+        fprintf(stderr,"Used %li bits to encode %i bits directly.\n",
+         nbits2-nbits,ftb);
+      }
+      entropy+=ftb+30;
+      nbits=nbits2;
       ec_enc_bits64(&enc,(ec_uint64)i<<30|i,ftb+30);
+      nbits2=ec_enc_tell(&enc);
+      if(nbits2-nbits!=ftb+30){
+        fprintf(stderr,"Used %li bits to encode %i bits directly.\n",
+         nbits2-nbits,ftb+30);
+      }
     }
   }
   for(sz=1;sz<256;sz++){
     ec_probmod_init_full(&mod,sz,1,sz+(sz>>1),NULL);
     for(i=0;i<sz;i++){
       s=((unsigned)(i*45678901+7))%sz;
+      entropy+=(log(mod.ft)-log(ec_bitree_get_freq(mod.bitree,s)))*M_LOG2E;
       ec_probmod_write(&mod,&enc,s);
     }
     ec_probmod_clear(&mod);
@@ -42,12 +66,19 @@ int main(int _argc,char **_argv){
     ec_probmod_init_full(&mod,sz,1,sz+(sz>>1),NULL);
     for(i=0;i<sz;i++){
       s=((unsigned)(i*45678901+7))%sz;
+      entropy+=(log(ec_bitree_get_cumul(mod.bitree,EC_MINI(s+6,sz))-
+       ec_bitree_get_cumul(mod.bitree,EC_MAXI(s-5,0)))-
+       log(ec_bitree_get_freq(mod.bitree,s)))*M_LOG2E;
       ec_probmod_write_range(&mod,&enc,s,EC_MAXI(s-5,0),EC_MINI(s+6,sz));
     }
     ec_probmod_clear(&mod);
   }
+  nbits=ec_enc_tell(&enc);
   ec_enc_done(&enc);
-  fprintf(stderr,"Encoded to %li bytes.\n",(long)(buf.ptr-buf.buf));
+  fprintf(stderr,
+   "Encoded %0.2lf bits of entropy to %li bits (%0.3lf%% wasted).\n",
+   entropy,nbits,100*(nbits-entropy)/nbits);
+  fprintf(stderr,"Packed to %li bytes.\n",(long)(buf.ptr-buf.buf));
   ec_byte_readinit(&buf,ec_byte_get_buffer(&buf),ec_byte_bytes(&buf));
   ec_dec_init(&dec,&buf);
   for(ft=0;ft<1024;ft++){
diff --git a/libentcode/entcode.c b/libentcode/entcode.c
index 3ace831f3ccdfc25a8ace627454283fcbf1caf3f..e1ca492d50f48845e123b4a795985e27b1118cc2 100644
--- a/libentcode/entcode.c
+++ b/libentcode/entcode.c
@@ -18,7 +18,7 @@ unsigned char *ec_byte_get_buffer(ec_byte_buffer *_b){
 
 int ec_ilog(ec_uint32 _v){
 #if defined(EC_CLZ)
-  return EC_CLZ0-EC_CLZ(_v)&-!!_v;
+  return EC_CLZ0-EC_CLZ(_v);
 #else
   /*On a Pentium M, this branchless version tested as the fastest on
      1,000,000,000 random 32-bit integers, edging out a similar version with
@@ -45,7 +45,7 @@ int ec_ilog(ec_uint32 _v){
 
 int ec_ilog64(ec_uint64 _v){
 #if defined(EC_CLZ64)
-  return EC_CLZ64_0-EC_CLZ64(_v)&-!!_v;
+  return EC_CLZ64_0-EC_CLZ64(_v);
 #else
   ec_uint32 v;
   int       ret;
diff --git a/libentcode/entenc.c b/libentcode/entenc.c
index 188aa4241baa8beaab25bc0d611acc4037bdf175..62ef8844fa2fef701d9acfaa2ca66c854933dc9e 100644
--- a/libentcode/entenc.c
+++ b/libentcode/entenc.c
@@ -75,12 +75,10 @@ void ec_enc_bits(ec_enc *_this,ec_uint32 _fl,int _ftb){
 }
 
 void ec_enc_bits64(ec_enc *_this,ec_uint64 _fl,int _ftb){
-  ec_uint32 fl;
-  ec_uint32 ft;
   if(_ftb>32){
-    _ftb-=32;
-    fl=(ec_uint32)(_fl>>_ftb)&0xFFFFFFFF;
-    ec_enc_bits(_this,fl,32);
+    ec_enc_bits(_this,(ec_uint32)(_fl>>32),_ftb-32);
+    _ftb=32;
+    _fl&=0xFFFFFFFF;
   }
   ec_enc_bits(_this,(ec_uint32)_fl,_ftb);
 }
@@ -91,7 +89,7 @@ void ec_enc_uint(ec_enc *_this,ec_uint32 _fl,ec_uint32 _ft){
   unsigned  fl;
   int       ftb;
   _ft--;
-  ftb=EC_ILOG(_ft);
+  ftb=EC_ILOG(_ft)&-!!_ft;
   while(ftb>EC_UNIT_BITS){
     ftb-=EC_UNIT_BITS;
     ft=(_ft>>ftb)+1;
@@ -114,7 +112,7 @@ void ec_enc_uint64(ec_enc *_this,ec_uint64 _fl,ec_uint64 _ft){
   unsigned  fl;
   int       ftb;
   _ft--;
-  ftb=EC_ILOG64(_ft);
+  ftb=EC_ILOG64(_ft)&-!!_ft;
   while(ftb>EC_UNIT_BITS){
     ftb-=EC_UNIT_BITS;
     ft=(unsigned)(_ft>>ftb)+1;
diff --git a/libentcode/entenc.h b/libentcode/entenc.h
index 1ba891cf9b2ed9a5796d934d562f621cc6b66b1a..3179bcc2ded6fe25b67bdb3b581cbeba61934be2 100644
--- a/libentcode/entenc.h
+++ b/libentcode/entenc.h
@@ -63,6 +63,12 @@ void ec_enc_uint(ec_enc *_this,ec_uint32 _fl,ec_uint32 _ft);
        This must be at least one, and no more than 2**64-1.*/
 void ec_enc_uint64(ec_enc *_this,ec_uint64 _fl,ec_uint64 _ft);
 
+/*Returns the number of bits "used" by the encoded symbols so far.
+  The actual number of bits may be larger, due to rounding to whole bytes, or
+   smaller, due to trailing zeros that can be stripped.
+  Return: the number of bits.*/
+long ec_enc_tell(ec_enc *_this);
+
 /*Indicates that there are no more symbols to encode.
   All reamining output bytes are flushed to the output buffer.
   ec_enc_init() must be called before the encoder can be used again.*/
diff --git a/libentcode/mfrngenc.c b/libentcode/mfrngenc.c
index ec861414be5e4e566b9021d57ef77495f17a9790..5d222a9157db2b839c9fa278e9b584365d49a07e 100644
--- a/libentcode/mfrngenc.c
+++ b/libentcode/mfrngenc.c
@@ -119,6 +119,19 @@ void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft){
   ec_enc_normalize(_this);
 }
 
+long ec_enc_tell(ec_enc *_this){
+  long total;
+  /*Start from the byte-granular part of the count, converted to bits.*/
+  total=(ec_byte_bytes(_this->buf)+(_this->rem>=0)+_this->ext)<<3;
+  /*The encoder state still holds a non-integral number of bits.
+    Count the bits of low that would have to be encoded to keep the value
+     inside the range whatever bits come next.
+    This is subtly different from the value the stream would really end with,
+     which tries to make as many of the trailing bits zeros as possible.*/
+  total+=EC_CODE_BITS-EC_ILOG(_this->rng);
+  return total;
+}
+
 void ec_enc_done(ec_enc *_this){
   /*We compute the integer in the current interval that has the largest number
      of trailing zeros, and write that to the stream.
@@ -148,6 +161,7 @@ void ec_enc_done(ec_enc *_this){
     unsigned char *buf;
     /*Flush it into the output buffer.*/
     ec_enc_carry_out(_this,0);
+    _this->rem=-1;
     /*We may be able to drop some redundant bytes from the end.*/
     buf=ec_byte_get_buffer(_this->buf);
     p=buf+ec_byte_bytes(_this->buf)-1;
diff --git a/libentcode/rangeenc.c b/libentcode/rangeenc.c
index dcd9db5bbb6e2682b77a10819ae35972f0c0a0d3..5833da6479256e33a6b8f422b4d866b4e85d1a8a 100644
--- a/libentcode/rangeenc.c
+++ b/libentcode/rangeenc.c
@@ -91,6 +91,19 @@ void ec_encode(ec_enc *_this,unsigned _fl,unsigned _fh,unsigned _ft){
   ec_enc_normalize(_this);
 }
 
+long ec_enc_tell(ec_enc *_this){
+  long total;
+  /*Start from the byte-granular part of the count, converted to bits.*/
+  total=(ec_byte_bytes(_this->buf)+(_this->rem>=0)+_this->ext)<<3;
+  /*The encoder state still holds a non-integral number of bits.
+    Count the bits of low that would have to be encoded to keep the value
+     inside the range whatever bits come next.
+    This is subtly different from the value the stream would really end with,
+     which tries to make as many of the trailing bits zeros as possible.*/
+  total+=EC_CODE_BITS-EC_ILOG(_this->rng);
+  return total;
+}
+
 void ec_enc_done(ec_enc *_this){
   /*We compute the integer in the current interval that has the largest number
      of trailing zeros, and write that to the stream.
@@ -120,6 +133,7 @@ void ec_enc_done(ec_enc *_this){
     unsigned char *buf;
     /*Flush it into the output buffer.*/
     ec_enc_carry_out(_this,0);
+    _this->rem=-1;
     /*We may be able to drop some redundant bytes from the end.*/
     buf=ec_byte_get_buffer(_this->buf);
     p=buf+ec_byte_bytes(_this->buf)-1;