diff --git a/dnn/nnet.c b/dnn/nnet.c
index 73c49fc305e617165e1d98c99b3440ef81688e9e..60bde585d70b9bbc01abed8d7a5bf3b549cdeea2 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -384,6 +384,58 @@ void compute_conv1d(const Conv1DLayer *layer, float *output, float *mem, const f
    compute_generic_conv1d(&matrix, output, mem, input, layer->nb_inputs, layer->activation);
 }
 
+/* Computes non-padded ("valid") convolution for input [ ksize1 x in_channels x (len2+ksize2-1) ],
+   kernel [ out_channels x in_channels x ksize1 x ksize2 ],
+   storing the output as [ out_channels x len2 ].
+   We assume that the output dimension along the ksize1 axis is 1,
+   i.e. processing one frame at a time. */
+void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int len2)
+{
+   int i;
+   int in_stride;
+   in_stride = len2+kheight-1;   /* height of one input channel plane: len2 outputs need len2+kheight-1 samples */
+   OPUS_CLEAR(out, out_channels*len2);
+   for (i=0;i<out_channels;i++) {
+      int m;
+      for (m=0;m<in_channels;m++) {
+         int t;
+         for (t=0;t<ktime;t++) {
+            int h;
+            for (h=0;h<kheight;h++) {
+               int j;
+               for (j=0;j<len2;j++) {
+                  /* Accumulate weight[i][m][t][h] * in[t][m][j+h] into out[i][j]. */
+                  out[i*len2 + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + t*kheight + h] *
+                                     in[t*in_channels*in_stride + m*in_stride + j + h];
+               }
+            }
+         }
+      }
+   }
+}
+
+#define MAX_CONV2D_INPUTS 2048
+
+/* Applies a Conv2DLayer to one frame of input, keeping the previous
+   ktime-1 frames in mem so successive calls implement a streaming
+   (one-frame-at-a-time) 2D convolution along the time axis.
+   out must not alias in; len2 is the per-frame output height. */
+void compute_conv2d(const Conv2DLayer *conv, float *out, float *mem, const float *in, int len2, int activation)
+{
+   int i;
+   const float *bias;
+   float in_buf[MAX_CONV2D_INPUTS];
+   int time_stride;
+   celt_assert(in != out);
+   /* Per-frame stride must match conv2d_float's in_stride = len2+kheight-1
+      (a "valid" convolution consumes len2+kheight-1 samples per channel). */
+   time_stride = conv->in_channels*(len2+conv->kheight-1);
+   celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS);
+   OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride);           /* history: ktime-1 past frames */
+   OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride); /* append current frame */
+   OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride); /* shift history for next call */
+   bias = conv->bias;
+   conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, len2);
+   if (bias != NULL) {
+      for (i=0;i<conv->out_channels*len2;i++) out[i] += bias[i];
+   }
+   compute_activation(out, out, conv->out_channels*len2, activation);
+}
+
+
 void compute_embedding(const EmbeddingLayer *layer, float *output, int input)
 {
    int i;
diff --git a/dnn/nnet.h b/dnn/nnet.h
index 2b43308a7d6e5f1917e5406cf6437560db5c21de..386d204de5f878b0143a69f28eb6a1db3f8c455f 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -75,6 +75,16 @@ typedef struct {
   int nb_outputs;
 } LinearLayer;
 
+/* 2D convolutional layer (kernel: out_channels x in_channels x ktime x kheight). */
+typedef struct {
+  const float *bias;
+  const float *float_weights;
+  int in_channels;
+  int out_channels;
+  int ktime;
+  int kheight;
+} Conv2DLayer;
+
 typedef struct {
   const float *bias;
   const float *input_weights;