diff --git a/dnn/nnet.c b/dnn/nnet.c
index 73c49fc305e617165e1d98c99b3440ef81688e9e..60bde585d70b9bbc01abed8d7a5bf3b549cdeea2 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -384,6 +384,63 @@ void compute_conv1d(const Conv1DLayer *layer, float *output, float *mem, const f
    compute_generic_conv1d(&matrix, output, mem, input, layer->nb_inputs, layer->activation);
 }
 
+/* Computes non-padded ("valid") convolution for input [ ksize1 x in_channels x (len2+ksize2-1) ],
+   kernel [ out_channels x in_channels x ksize1 x ksize2 ],
+   storing the output as [ out_channels x len2 ].
+   We assume that the output dimension along the ksize1 axis is 1,
+   i.e. processing one frame at a time. */
+void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int len2)
+{
+   int i;
+   int in_stride;
+   /* One input row per channel: producing len2 outputs with a kernel of
+      height kheight requires exactly len2+kheight-1 input samples. */
+   in_stride = len2+kheight-1;
+   OPUS_CLEAR(out, out_channels*len2);
+   for (i=0;i<out_channels;i++) {
+      int m;
+      for (m=0;m<in_channels;m++) {
+         int t;
+         for (t=0;t<ktime;t++) {
+            int h;
+            for (h=0;h<kheight;h++) {
+               int j;
+               for (j=0;j<len2;j++) {
+                  out[i*len2 + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + t*kheight + h] *
+                     in[t*in_channels*in_stride + m*in_stride + j + h];
+               }
+            }
+         }
+      }
+   }
+}
+
+#define MAX_CONV2D_INPUTS 2048
+
+void compute_conv2d(const Conv2DLayer *conv, float *out, float *mem, const float *in, int len2, int activation)
+{
+   int i;
+   const float *bias;
+   float in_buf[MAX_CONV2D_INPUTS];
+   int time_stride;
+   celt_assert(in != out);
+   /* Must match the per-frame layout assumed by conv2d_float(), i.e.
+      in_channels rows of len2+kheight-1 samples each; otherwise the time
+      frames stacked in in_buf would be read at the wrong offsets. */
+   time_stride = conv->in_channels*(len2+conv->kheight-1);
+   celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS);
+   OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride);
+   OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride);
+   OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride);
+   bias = conv->bias;
+   conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, len2);
+   if (bias != NULL) {
+      for (i=0;i<conv->out_channels*len2;i++) out[i] += bias[i];
+   }
+   compute_activation(out, out, conv->out_channels*len2, activation);
+}
+
+
 void compute_embedding(const EmbeddingLayer *layer, float *output, int input)
 {
    int i;
diff --git a/dnn/nnet.h b/dnn/nnet.h
index 2b43308a7d6e5f1917e5406cf6437560db5c21de..386d204de5f878b0143a69f28eb6a1db3f8c455f 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -75,6 +75,16 @@ typedef struct {
   int nb_outputs;
 } LinearLayer;
 
+/* Generic 2D convolution layer (processes one time frame per call). */
+typedef struct {
+  const float *bias;
+  const float *float_weights;
+  int in_channels;
+  int out_channels;
+  int ktime;
+  int kheight;
+} Conv2DLayer;
+
 typedef struct {
   const float *bias;
   const float *input_weights;