/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <emmintrin.h>
#include <string.h>

#include "./av1_rtcd.h"
#include "av1/common/warped_motion.h"

/*
 * SSE2 affine warp of an 8-bit prediction block.
 *
 * The destination block is processed in 8x8 tiles. For each tile the centre
 * sample is projected through the affine model 'mat' to find the source
 * position; an 8-tap horizontal filter then produces up to 15 intermediate
 * rows in 'tmp', and an 8-tap vertical filter turns those into up to 8 output
 * rows.
 *
 * mat            Affine model. mat[2], mat[3], mat[0] produce the x
 *                coordinate and mat[4], mat[5], mat[1] the y coordinate.
 * ref            Reference picture. Rows are clamped to [0, height - 1];
 *                columns are NOT clamped in the general path, so the
 *                left/right borders must already be extended (see the note
 *                in the body).
 * width, height, stride
 *                Dimensions and row pitch of 'ref'.
 * pred           8-bit output buffer, used when the conv_params path is off.
 * p_col, p_row   Position of the output block, in output-plane samples.
 * p_width, p_height, p_stride
 *                Size and row pitch of the output block. A width of exactly
 *                4 triggers the narrow store path.
 * subsampling_x, subsampling_y
 *                Shifts used to convert between this plane's coordinates and
 *                the coordinates the model is expressed in.
 * conv_params    When conv_params->round == CONVOLVE_OPT_NO_ROUND and
 *                conv_params->dst is non-NULL, results are written to
 *                conv_params->dst as four values per 128-bit store
 *                (presumably 32-bit elements -- confirm against the
 *                ConvolveParams definition) and do_post_rounding is set to 1.
 *                conv_params->do_average selects accumulation (dst path) or
 *                rounding average (pred path) with the existing contents.
 * alpha, beta    Horizontal filter phase step per output column / per row.
 * gamma, delta   Vertical filter phase step per output column / per row.
 */
void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
                          int height, int stride, uint8_t *pred, int p_col,
                          int p_row, int p_width, int p_height, int p_stride,
                          int subsampling_x, int subsampling_y,
                          ConvolveParams *conv_params, int16_t alpha,
                          int16_t beta, int16_t gamma, int16_t delta) {
  int comp_avg = conv_params->do_average;
  // Horizontally filtered rows -7 .. +7 around the tile centre, stored at
  // index (row + 7).
  __m128i tmp[15];
  int i, j, k;
  const int bd = 8;
  const int use_conv_params =
      (conv_params->round == CONVOLVE_OPT_NO_ROUND && conv_params->dst);
  const int reduce_bits_horiz =
      use_conv_params ? conv_params->round_0 : HORSHEAR_REDUCE_PREC_BITS;
  const int reduce_bits_vert =
      use_conv_params ? conv_params->round_1
                      : 2 * WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz =
      use_conv_params ? bd + FILTER_BITS - 1 : bd + WARPEDPIXEL_FILTER_BITS - 1;
  if (use_conv_params) {
    conv_params->do_post_rounding = 1;
  }
  // The two offset_bits_horiz expressions above, and the constant-column
  // shortcuts below, rely on both filter precisions being identical.
  assert(FILTER_BITS == WARPEDPIXEL_FILTER_BITS);

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      // Project the centre of this 8x8 tile through the affine model.
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      // Split into integer sample position and fractional phase.
      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      // Drop the low WARP_PARAM_REDUCE_BITS bits of the starting phases.
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          // Constant row built from the leftmost pixel: the offset term plus
          // the pixel scaled by the gain remaining after the horizontal
          // rounding shift.
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] *
                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          // Same as above, using the rightmost pixel of the row.
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride + (width - 1)] *
                  (1 << (WARPEDPIXEL_FILTER_BITS - reduce_bits_horiz)));
        }
      } else {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          // Filter phase for the first column of this row.
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          const __m128i zero = _mm_setzero_si128();
          const __m128i src =
              _mm_loadu_si128((const __m128i *)(ref + iy * stride + ix4 - 7));

          // Filter even-index pixels
          const __m128i tmp_0 = _mm_loadu_si128(
              (const __m128i *)(warped_filter +
                                ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_2 = _mm_loadu_si128(
              (const __m128i *)(warped_filter +
                                ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_4 = _mm_loadu_si128(
              (const __m128i *)(warped_filter +
                                ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_6 = _mm_loadu_si128(
              (const __m128i *)(warped_filter +
                                ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

          // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
          const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
          // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
          const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
          // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
          const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
          // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
          const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

          // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
          const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
          // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
          const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
          // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
          const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
          // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
          const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

          // Offset plus round-to-nearest term for the horizontal shift.
          const __m128i round_const = _mm_set1_epi32(
              (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));

          // Calculate filtered results
          const __m128i src_0 = _mm_unpacklo_epi8(src, zero);
          const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
          const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
          const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
          const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
          const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
          const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
          const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

          __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                           _mm_add_epi32(res_2, res_6));
          res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
                                   _mm_cvtsi32_si128(reduce_bits_horiz));

          // Filter odd-index pixels
          const __m128i tmp_1 = _mm_loadu_si128(
              (const __m128i *)(warped_filter +
                                ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_3 = _mm_loadu_si128(
              (const __m128i *)(warped_filter +
                                ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_5 = _mm_loadu_si128(
              (const __m128i *)(warped_filter +
                                ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_7 = _mm_loadu_si128(
              (const __m128i *)(warped_filter +
                                ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

          const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
          const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
          const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
          const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

          const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
          const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
          const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
          const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

          const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
          const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
          const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
          const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
          const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
          const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
          const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
          const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

          __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                          _mm_add_epi32(res_3, res_7));
          res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
                                  _mm_cvtsi32_si128(reduce_bits_horiz));

          // Combine results into one register.
          // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
          // as this order helps with the vertical filter.
          tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
        }
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        // Filter phase for the first column of this output row.
        int sy = sy4 + delta * (k + 4);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        const __m128i *src = tmp + (k + 4);
        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        const __m128i tmp_0 = _mm_loadu_si128(
            (const __m128i *)(warped_filter +
                              ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_2 = _mm_loadu_si128(
            (const __m128i *)(warped_filter +
                              ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_4 = _mm_loadu_si128(
            (const __m128i *)(warped_filter +
                              ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_6 = _mm_loadu_si128(
            (const __m128i *)(warped_filter +
                              ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        const __m128i tmp_1 = _mm_loadu_si128(
            (const __m128i *)(warped_filter +
                              ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_3 = _mm_loadu_si128(
            (const __m128i *)(warped_filter +
                              ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_5 = _mm_loadu_si128(
            (const __m128i *)(warped_filter +
                              ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_7 = _mm_loadu_si128(
            (const __m128i *)(warped_filter +
                              ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        if (use_conv_params) {
          // High-precision path: round by round_1 and write (or accumulate
          // into) conv_params->dst, four results per 128-bit store.
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
          // NOTE(review): the negative term presumably cancels the offset
          // added by the horizontal stage -- confirm against the C reference.
          const __m128i round_const = _mm_set1_epi32(
              -(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)) +
              ((1 << (conv_params->round_1)) >> 1));
          res_lo = _mm_add_epi32(res_lo, round_const);
          res_lo =
              _mm_sra_epi32(res_lo, _mm_cvtsi32_si128(conv_params->round_1));
          if (comp_avg) res_lo = _mm_add_epi32(_mm_loadu_si128(p), res_lo);
          _mm_storeu_si128(p, res_lo);
          // Columns 4..7 are only written for blocks wider than 4.
          if (p_width > 4) {
            res_hi = _mm_add_epi32(res_hi, round_const);
            res_hi =
                _mm_sra_epi32(res_hi, _mm_cvtsi32_si128(conv_params->round_1));
            if (comp_avg)
              res_hi = _mm_add_epi32(_mm_loadu_si128(p + 1), res_hi);
            _mm_storeu_si128(p + 1, res_hi);
          }
        } else {
          // Round and pack into 8 bits
          const __m128i round_const =
              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                             ((1 << reduce_bits_vert) >> 1));

          const __m128i res_lo_round = _mm_srai_epi32(
              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
          const __m128i res_hi_round = _mm_srai_epi32(
              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);

          const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
          __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

          // Store, blending with 'pred' if needed
          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          // Note: If we're outputting a 4x4 block, we need to be very careful
          // to only output 4 pixels at this point, to avoid encode/decode
          // mismatches when encoding with multiple threads.
          if (p_width == 4) {
            // The 4 bytes are moved with memcpy rather than through a
            // uint32_t pointer: 'pred' is a byte buffer with no alignment
            // guarantee, and accessing it as uint32_t would break the
            // strict-aliasing rules.
            if (comp_avg) {
              uint32_t orig_bits;
              memcpy(&orig_bits, p, sizeof(orig_bits));
              const __m128i orig = _mm_cvtsi32_si128((int)orig_bits);
              res_8bit = _mm_avg_epu8(res_8bit, orig);
            }
            const uint32_t out_bits = (uint32_t)_mm_cvtsi128_si32(res_8bit);
            memcpy(p, &out_bits, sizeof(out_bits));
          } else {
            if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
            _mm_storel_epi64(p, res_8bit);
          }
        }
      }
    }
  }
}