Commit 2ec8d9e5 authored by Timothy B. Terriberry's avatar Timothy B. Terriberry Committed by Jean-Marc Valin
Browse files

Multiplier-free entropy coder

parent aeb4467d
......@@ -5,7 +5,7 @@
AUTOMAKE_OPTIONS = 1.6
#Fools KDevelop into including all files
SUBDIRS = libcelt
SUBDIRS = libcelt libentcode
rpm: dist
rpmbuild -ta ${PACKAGE}-${VERSION}.tar.gz
......@@ -121,7 +121,7 @@ fi
AC_SUBST(SIZE16)
AC_SUBST(SIZE32)
AC_OUTPUT([Makefile libcelt/Makefile])
AC_OUTPUT([Makefile libcelt/Makefile] libentcode/Makefile)
if test "x$src" = "x"; then
echo "**IMPORTANT**"
......
INCLUDES =
METASOURCES = AUTO
lib_LTLIBRARIES = libentcode.la
libentcode_la_SOURCES = bitrdec.c bitree.c bitrenc.c ecintrin.h entcode.c \
entdec.c entenc.c mfrngdec.c mfrngenc.c probdec.c probenc.c probmod.c
bin_PROGRAMS = ectest
ectest_SOURCES = ectest.c
ectest_LDADD = $(top_builddir)/libentcode/libentcode.la
noinst_HEADERS = bitrdec.h bitree.h bitrenc.h ecintrin.h entcode.h entdec.h \
entenc.h mfrngcod.h probdec.h probenc.h probmod.h
#include "bitree.h"
int ec_bitree_find_and_update(unsigned *_this,int _sz,int _split,
unsigned _freq,unsigned *_fl,int _val){
int base;
int test;
int fl;
base=-1;
fl=0;
while(_split>0){
test=base+_split;
if(test<_sz){
if(_freq>=_this[test]){
_freq-=_this[test];
fl+=_this[test];
base=test;
}
else _this[test]+=_val;
}
_split>>=1;
}
*_fl=fl;
return base+1;
}
#if !defined(_bitrenc_H)
# define _bitredec_H (1)
# include "bitree.h"
/*Decoder-specific functions for Binary Indexed Trees.
See bitree.h for more detailed documentation.*/
/*Gets the symbol that corresponds with a given frequency.
This is an omnibus function that also computes the cumulative frequency of
the symbols before the one returned, and updates the count of that symbol by
the given amount.
_sz: The size of the table.
_split: The largest power of two less than OR EQUAL to the table size.
_freq: A frequency in the range of one of the symbols in the alphabet.
_fl: Returns the sum of the frequencies of the symbols less than that of
the returned symbol.
_val: The amount to add to returned symbol's frequency.
Return: The smallest symbol whose cumulative frequency is greater than freq.*/
int ec_bitree_find_and_update(unsigned *_this,int _sz,int _split,
unsigned _freq,unsigned *_fl,int _val);
#endif
#include "bitree.h"
void ec_bitree_to_counts(unsigned *_this,int _sz,int _split){
int p;
int q;
int s;
for(p=_split;p>1;p=q){
q=p>>1;
for(s=p-1;s<_sz;s+=p)_this[s]-=_this[s-q];
}
}
void ec_bitree_from_counts(unsigned *_this,int _sz){
int p;
int q;
int s;
for(q=1,p=2;p<=_sz;q=p,p=q<<1){
for(s=p-1;s<_sz;s+=p)_this[s]+=_this[s-q];
}
}
unsigned ec_bitree_get_cumul(const unsigned *_this,int _sym){
unsigned ret;
ret=0;
while(_sym>0){
ret+=_this[_sym-1];
_sym&=_sym-1;
}
return ret;
}
unsigned ec_bitree_get_freq(const unsigned *_this,int _sym){
unsigned ret;
int p;
ret=_this[_sym];
p=_sym&_sym+1;
while(_sym!=p){
ret-=_this[_sym-1];
_sym&=_sym-1;
}
return ret;
}
#if 0
/*Fenwick's approach to re-scaling the counts.
This tests to be twice as slow or more than the one below, even with inline
functions enabled, and without loop vectorization (which would make Moffat's
approach even faster).*/
void ec_bitree_halve(unsigned *_this,int _sz,int _split){
int i;
for(i=_sz;i-->0;){
ec_bitree_update(_this,_sz,i,-(int)(ec_bitree_get_freq(_this,i)>>1));
}
}
#else
/*Fenwick mentions that this approach is also possible, and this is what
Moffat uses.
Simply convert the tree into a simple array of counts, perform the halving,
and then convert it back.*/
void ec_bitree_halve(unsigned *_this,int _sz,int _split){
int i;
ec_bitree_to_counts(_this,_sz,_split);
/*LOOP VECTORIZES.*/
for(i=0;i<_sz;i++)_this[i]-=_this[i]>>1;
ec_bitree_from_counts(_this,_sz);
}
#endif
#if 0
#include <stdio.h>
/*Simple regression test code.
Compile with bitrenc.c and bitrdec.c as well.*/
static void ec_bitree_print(unsigned *_this,int _sz){
int i;
for(i=0;i<_sz;i++)printf("%3i%c",_this[i],i+1<_sz?' ':'\n');
}
int main(void){
unsigned t[16]={0,8,4,9,2,10,5,11,1,12,6,13,3,14,7,15};
int fl;
int s;
int i;
ec_bitree_print(t,16);
ec_bitree_from_counts(t,16);
ec_bitree_print(t,16);
for(i=0;i<=16;i++)printf("%3i%c",ec_bitree_get_cumul(t,i),i<16?' ':'\n');
for(i=0;i<t[15];i++){
s=ec_bitree_find_and_update(t,16,16,i,&fl,0);
printf("%3i: %i %3i\n",i,s,fl);
}
for(i=0;i<16;i++){
s=ec_bitree_find_and_update(t,16,ec_bitree_get_cumul(t,i),&fl,100);
ec_bitree_to_counts(t,16,16);
ec_bitree_print(t,16);
ec_bitree_from_counts(t,16);
ec_bitree_update(t,16,s,-100);
}
return 0;
}
#endif
/*Implements Binary Indexed Trees for cumulative probability tables, based
upon a combination of the techniques described in \cite{Fen93,Fen95,Mof99}.
This is a really, amazingly elegant data structure, that maintains
cumulative frequency tables in logarithmic time, using exactly the same
space as an ordinary frequency table.
In addition, the non-cumulative frequency can be retrieved in constant
amortized time (under 2 memory references per symbol on average).
We are dealing primarily with relatively small alphabets and are not sorting
symbols by frequency, and so we adopt Fenwick's tree organization strategy.
It's complexity has better constant factors, and although it is logarithmic
in n (the alphabet size) instead of s (the index of the symbol), the latter
is not expected to be appreciably smaller.
We modify it however, to remove the special cases surrounding the element 0,
which greatly streamlines the code.
Our scheme has the added benefit that for alphabet sizes that are a power of
2, the last element of the array is the total cumulative frequency count.
We choose Moffat's approach to halving the entire frequency table, which is
over twice as fast in practice as that suggested by Fenwick, even though
they have the same number of memory accesses.
We also adapt Moffat's suggestion for an omnibus decode function that updates
the count of the symbol being decoded while it is searching for it.
We also have it retain and return the cumulative frequency count needed to
update the arithmetic decoder.
See bitrenc.h and bitrdec.h for encoding- and decoding-specific functions.
@TECHREPORT{Fen93,
author ="Peter Fenwick",
title ="A new data structure for cumulative probability tables",
institution="The University of Auckland, Department of Computer Science",
year =1993,
number =88,
month =May,
URL ="http://www.cs.auckland.ac.nz/~peter-f/ftplink/TechRep88.ps"
}
@TECHREPORT{Fen95,
author ="Peter Fenwick",
title ="A new data structure for cumulative probability tables: an
improved frequency to symbol algorithm",
institution="The University of Auckland, Department of Computer Science",
year =1995,
number =110,
month =Feb,
URL ="http://www.cs.auckland.ac.nz/~peter-f/ftplink/TechRep110.ps"
}
@ARTICLE{Mof99,
author ="Alistair Moffat",
title ="An improved data structure for cumulative probability tables",
journal ="Software Practice and Experience",
volume =29,
number =7,
pages ="647--659",
year =1999
}*/
#if !defined(_bitree_H)
# define _bitree_H (1)
/*Converts an array of frequency counts to our cumulative tree representation.
_sz: The size of the table.*/
void ec_bitree_from_counts(unsigned *_this,int _sz);
/*Converts our cumulative tree representation to an array of frequency counts.
_sz: The size of the table.
_split: The largest power of two less than OR EQUAL to the table size.*/
void ec_bitree_to_counts(unsigned *_this,int _sz,int _split);
/*Gets the cumulative frequency of the symbols less than the given one.
_sym: The symbol to obtain the cumulative frequency for.
Return: The sum of the frequencies of the symbols less than _sym.*/
unsigned ec_bitree_get_cumul(const unsigned *_this,int _sym);
/*Gets the frequency of a single symbol.
_sym: The symbol to obtain the frequency for.
Return: The frequency of _sym.*/
unsigned ec_bitree_get_freq(const unsigned *_this,int _sym);
/*Halves the frequency of each symbol, rounding up.
_sz: The size of the table.
_split: The largest power of two less than OR EQUAL to the table size.*/
void ec_bitree_halve(unsigned *_this,int _sz,int _split);
#endif
#include "bitrenc.h"
void ec_bitree_update(unsigned *_this,int _sz,int _sym,int _val){
do{
_this[_sym]+=_val;
_sym+=_sym+1&-(_sym+1);
}
while(_sym<_sz);
}
#if !defined(_bitrenc_H)
# define _bitrenc_H (1)
# include "bitree.h"
/*Encoder-specific functions for Binary Indexed Trees.
See bitree.h for more detailed documentation.*/
/*Updates the frequency of a given symbol.
_sz: The size of the table.
_sym: The symbol to update.
_val: The amount to add to this symbol's frequency.*/
void ec_bitree_update(unsigned *_this,int _sz,int _sym,int _val);
#endif
/*Some common macros for potential platform-specific optimization.*/
#include <math.h>
#include <limits.h>
#if !defined(_ecintrin_H)
# define _ecintrin_H (1)
/*Some specific platforms may have optimized intrinsic or inline assembly
versions of these functions which can substantially improve performance.
We define macros for them to allow easy incorporation of these non-ANSI
features.*/
/*Note that we do not provide a macro for abs(), because it is provided as a
library function, which we assume is translated into an intrinsic to avoid
the function call overhead and then implemented in the smartest way for the
target platform.
With modern gcc (4.x), this is true: it uses cmov instructions if the
architecture supports it and branchless bit-twiddling if it does not (the
speed difference between the two approaches is not measurable).
Interestingly, the bit-twiddling method was patented in 2000 (US 6,073,150)
by Sun Microsystems, despite prior art dating back to at least 1996:
http://web.archive.org/web/19961201174141/www.x86.org/ftp/articles/pentopt/PENTOPT.TXT
On gcc 3.x, however, our assumption is not true, as abs() is translated to a
conditional jump, which is horrible on deeply piplined architectures (e.g.,
all consumer architectures for the past decade or more) when the sign cannot
be reliably predicted.*/
/*Modern gcc (4.x) can compile the naive versions of min and max with cmov if
given an appropriate architecture, but the branchless bit-twiddling versions
are just as fast, and do not require any special target architecture.
Earlier gcc versions (3.x) compiled both code to the same assembly
instructions, because of the way they represented ((_b)>(_a)) internally.*/
#define EC_MAXI(_a,_b) ((_a)-((_a)-(_b)&-((_b)>(_a))))
#define EC_MINI(_a,_b) ((_a)+((_b)-(_a)&-((_b)<(_a))))
/*This has a chance of compiling branchless, and is just as fast as the
bit-twiddling method, which is slightly less portable, since it relies on a
sign-extended rightshift, which is not guaranteed by ANSI (but present on
every relevant platform).*/
#define EC_SIGNI(_a) (((_a)>0)-((_a)<0))
/*Slightly more portable than relying on a sign-extended right-shift (which is
not guaranteed by ANSI), and just as fast, since gcc (3.x and 4.x both)
compile it into the right-shift anyway.*/
#define EC_SIGNMASK(_a) (-((_a)<0))
/*Clamps an integer into the given range.
If _a>_c, then the lower bound _a is respected over the upper bound _c (this
behavior is required to meet our documented API behavior).
_a: The lower bound.
_b: The value to clamp.
_c: The upper boud.*/
#define EC_CLAMPI(_a,_b,_c) (EC_MAXI(_a,EC_MINI(_b,_c)))
/*Count leading zeros.
This macro should only be used for implementing ec_ilog(), if it is defined.
All other code should use EC_ILOG() instead.*/
#if __GNUC_PREREQ(3,4)
# if INT_MAX>=2147483647
# define EC_CLZ0 sizeof(unsigned)*CHAR_BIT
# define EC_CLZ(_x) (__builtin_clz(_x))
# elif LONG_MAX>=2147483647L
# define EC_CLZ0 sizeof(unsigned long)*CHAR_BIT
# define EC_CLZ(_x) (__builtin_clzl(_x))
# endif
#endif
#if defined(EC_CLZ)
/*Note that __builtin_clz is not defined when _x==0, according to the gcc
documentation (and that of the BSR instruction that implements it on x86),
so we have to special-case it.*/
# define EC_ILOG(_x) (EC_CLZ0-EC_CLZ(_x)&-!!(_x))
#else
# define EC_ILOG(_x) (ec_ilog(_x))
#endif
#endif
#include <stdio.h>
#include "probenc.h"
#include "probdec.h"
int main(int _argc,char **_argv){
ec_byte_buffer buf;
ec_enc enc;
ec_dec dec;
ec_probmod mod;
int ft;
int ftb;
int sym;
int sz;
int s;
int i;
/*Testing encoding of raw bit values.*/
ec_byte_writeinit(&buf);
ec_enc_init(&enc,&buf);
for(ft=0;ft<1024;ft++){
for(i=0;i<ft;i++){
ec_enc_uint(&enc,i,ft);
}
}
/*Testing encoding of raw bit values.*/
for(ftb=0;ftb<16;ftb++){
for(i=0;i<(1<<ftb);i++){
ec_enc_bits(&enc,i,ftb);
}
}
for(sz=1;sz<256;sz++){
ec_probmod_init_full(&mod,sz,1,sz+(sz>>1),NULL);
for(i=0;i<sz;i++){
s=((unsigned)(i*45678901+7))%sz;
ec_probmod_write(&mod,&enc,s);
}
ec_probmod_clear(&mod);
}
for(sz=11;sz<256;sz++){
ec_probmod_init_full(&mod,sz,1,sz+(sz>>1),NULL);
for(i=0;i<sz;i++){
s=((unsigned)(i*45678901+7))%sz;
ec_probmod_write_range(&mod,&enc,s,EC_MAXI(s-5,0),EC_MINI(s+6,sz));
}
ec_probmod_clear(&mod);
}
ec_enc_done(&enc);
fprintf(stderr,"Encoded to %li bytes.\n",(long)(buf.ptr-buf.buf));
ec_byte_readinit(&buf,ec_byte_get_buffer(&buf),ec_byte_bytes(&buf));
ec_dec_init(&dec,&buf);
for(ft=0;ft<1024;ft++){
for(i=0;i<ft;i++){
sym=ec_dec_uint(&dec,ft);
if(sym!=i){
fprintf(stderr,"Decoded %i instead of %i with ft of %i.\n",sym,i,ft);
return -1;
}
}
}
for(ftb=0;ftb<16;ftb++){
for(i=0;i<(1<<ftb);i++){
sym=ec_dec_bits(&dec,ftb);
if(sym!=i){
fprintf(stderr,"Decoded %i instead of %i with ftb of %i.\n",sym,i,ftb);
return -1;
}
}
}
for(sz=1;sz<256;sz++){
ec_probmod_init_full(&mod,sz,1,sz+(sz>>1),NULL);
for(i=0;i<sz;i++){
s=((unsigned)(i*45678901+7))%sz;
sym=ec_probmod_read(&mod,&dec);
if(sym!=s){
fprintf(stderr,"Decoded %i instead of %i with sz of %i.\n",sym,s,sz);
return -1;
}
}
ec_probmod_clear(&mod);
}
for(sz=11;sz<256;sz++){
ec_probmod_init_full(&mod,sz,1,sz+(sz>>1),NULL);
for(i=0;i<sz;i++){
s=((unsigned)(i*45678901+7))%sz;
sym=ec_probmod_read_range(&mod,&dec,EC_MAXI(s-5,0),EC_MINI(s+6,sz));
if(sym!=s){
fprintf(stderr,"Decoded %i instead of %i with sz of %i.\n",sym,s,sz);
return -1;
}
}
ec_probmod_clear(&mod);
}
ec_byte_writeclear(&buf);
fprintf(stderr,"All tests passed.\n");
return 0;
}
#include "entcode.h"
void ec_byte_reset(ec_byte_buffer *_b){
_b->ptr=_b->buf;
}
long ec_byte_bytes(ec_byte_buffer *_b){
return _b->ptr-_b->buf;
}
unsigned char *ec_byte_get_buffer(ec_byte_buffer *_b){
return _b->buf;
}
int ec_ilog(ec_uint32 _v){
#if defined(EC_CLZ)
return EC_CLZ0-EC_CLZ(_v)&-!!_v;
#else
/*On a Pentium M, this branchless version tested as the fastest on
1,000,000,000 random 32-bit integers, edging out a similar version with
branches, and a 256-entry LUT version.*/
int ret;
int m;
ret=!!_v;
m=!!(_v&0xFFFF0000)<<4;
_v>>=m;
ret|=m;
m=!!(_v&0xFF00)<<3;
_v>>=m;
ret|=m;
m=!!(_v&0xF0)<<2;
_v>>=m;
ret|=m;
m=!!(_v&0xC)<<1;
_v>>=m;
ret|=m;
ret+=!!(_v&0x2);
return ret;
#endif
}
#if !defined(_entcode_H)
# define _entcode_H (1)
# include <limits.h>
# include "ecintrin.h"
typedef unsigned ec_uint32;
typedef struct ec_byte_buffer ec_byte_buffer;
/*The number of bits to code at a time when coding bits directly.*/
# define EC_UNIT_BITS (8)
/*The mask for the given bits.*/
# define EC_UNIT_MASK ((1U<<EC_UNIT_BITS)-1)
/*Simple libogg1-style buffer.*/
struct ec_byte_buffer{
unsigned char *buf;
unsigned char *ptr;
long storage;
};
/*Encoding functions.*/
void ec_byte_writeinit(ec_byte_buffer *_b);
void ec_byte_writetrunc(ec_byte_buffer *_b,long _bytes);
void ec_byte_write1(ec_byte_buffer *_b,unsigned _value);
void ec_byte_write4(ec_byte_buffer *_b,ec_uint32 _value);
void ec_byte_writecopy(ec_byte_buffer *_b,void *_source,long _bytes);
void ec_byte_writeclear(ec_byte_buffer *_b);
/*Decoding functions.*/
void ec_byte_readinit(ec_byte_buffer *_b,unsigned char *_buf,long _bytes);
int ec_byte_look1(ec_byte_buffer *_b);
int ec_byte_look4(ec_byte_buffer *_b,ec_uint32 *_val);
void ec_byte_adv1(ec_byte_buffer *_b);
void ec_byte_adv4(ec_byte_buffer *_b);
int ec_byte_read1(ec_byte_buffer *_b);
int ec_byte_read4(ec_byte_buffer *_b,ec_uint32 *_val);
/*Shared functions.*/
void ec_byte_reset(ec_byte_buffer *_b);
long ec_byte_bytes(ec_byte_buffer *_b);
unsigned char *ec_byte_get_buffer(ec_byte_buffer *_b);
int ec_ilog(ec_uint32 _v);
#endif
#include <stddef.h>
#include "entdec.h"
void ec_byte_readinit(ec_byte_buffer *_b,unsigned char *_buf,long _bytes){
_b->buf=_b->ptr=_buf;
_b->storage=_bytes;
}
int ec_byte_look1(ec_byte_buffer *_b){
ptrdiff_t endbyte;
endbyte=_b->ptr-_b->buf;
if(endbyte>=_b->storage)return -1;
else return _b->ptr[0];
}
int ec_byte_look4(ec_byte_buffer *_b,ec_uint32 *_val){
ptrdiff_t endbyte;
endbyte=_b->ptr-_b->buf;
if(endbyte+4>_b->storage){
if(endbyte<_b->storage){
*_val=_b->ptr[0];
endbyte++;
if(endbyte<_b->storage){
*_val|=(ec_uint32)_b->ptr[1]<<8;
endbyte++;
if(endbyte<_b->storage)*_val|=(ec_uint32)_b->ptr[2]<<16;
}
}
return -1;
}
else{
*_val=_b->ptr[0];
*_val|=(ec_uint32)_b->ptr[1]<<8;
*_val|=(ec_uint32)_b->ptr[2]<<16;
*_val|=(ec_uint32)_b->ptr[3]<<24;
}
return 0;
}
void ec_byte_adv1(ec_byte_buffer *_b){
_b->ptr++;
}
void ec_byte_adv4(ec_byte_buffer *_b){
_b->ptr+=4;
}
int ec_byte_read1(ec_byte_buffer *_b){
ptrdiff_t endbyte;
endbyte=_b->ptr-_b->buf;
if(endbyte>=_b->storage)return -1;
else return *(_b->ptr++);
}
int ec_byte_read4(ec_byte_buffer *_b,ec_uint32 *_val){
unsigned char *end;
end=_b->buf+_b->storage;
if(_b->ptr+4>end){
if(_b->ptr<end){
*_val=*(_b->ptr++);