vp9_variance_sse2.c 14.6 KB
Newer Older
John Koleszar's avatar
John Koleszar committed
1
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

11
12
#include <emmintrin.h>  // SSE2
#include <string.h>

#include "./vpx_config.h"

#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"

18
19
20
// Signature of the fixed-size block kernels that variance_sse2() tiles over a
// larger block.  For the block at src/ref (with the given row strides) a kernel
// stores its sum of squared differences in *sse and its signed sum of
// differences in *sum.  variance_sse2() ignores the kernel's return value.
typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       unsigned int *sse, int *sum);
John Koleszar's avatar
John Koleszar committed
21

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
// Loads four bytes starting at p into the low 32 bits of an XMM register (the
// upper 96 bits are zero).  memcpy is used instead of dereferencing a casted
// uint32_t pointer so the load is well defined for any alignment of p and does
// not break the strict-aliasing rules; compilers lower it to a single load.
static __m128i load4_sse2(const uint8_t *p) {
  uint32_t v;
  memcpy(&v, p, sizeof(v));
  return _mm_cvtsi32_si128((int)v);
}

// Reads 4 bytes from row i and 4 bytes from row i + 1 of p and interleaves
// them byte-wise into the low 8 bytes of the result.  Source and reference are
// loaded with the same interleaving, so lane-wise differences still pair up
// matching pixels.
#define READ64(p, stride, i) \
  _mm_unpacklo_epi8(load4_sse2((p) + (i) * (stride)), \
                    load4_sse2((p) + ((i) + 1) * (stride)))

// Computes the sum of differences and the sum of squared differences between
// the 4x4 block at src and the 4x4 block at ref.
//   src, ref               top-left pixel of each block (no alignment needed)
//   src_stride, ref_stride distance in bytes between consecutive rows
//   sse                    receives the sum of squared differences
//   sum                    receives the signed sum of differences
// Always returns 0.
unsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  // Rows 0-1 and rows 2-3 of each block, widened to eight 16-bit lanes.
  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum: horizontal add of the sixteen 16-bit differences.  The total is at
  // most 16 * 255 in magnitude, so it cannot overflow a 16-bit lane.
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  // _mm_extract_epi16 zero-extends; the cast restores the sign.
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse: madd squares each difference and adds adjacent pairs into 32-bit
  // lanes, which are then reduced horizontally.
  vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
                       _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);

  return 0;
}
Yunqing Wang's avatar
Yunqing Wang committed
53

54
55
56
57
58
59
60
61
62
63
64
65
// 8x8 block kernel with the variance_fn_t signature.  Only declared here; the
// definition is not in this file.
unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                unsigned int *sse, int *sum);

// 16x16 block kernel with the variance_fn_t signature.  Only declared here;
// the definition is not in this file.
unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse, int *sum);

// Accumulates SSE and sum-of-differences over a w x h block by running var_fn
// on every block_size x block_size tile, row by row.
//   src, ref               top-left pixel of the two blocks
//   src_stride, ref_stride distance in bytes between consecutive rows
//   w, h                   block dimensions in pixels
//   sse, sum               receive the totals over all tiles
//   var_fn                 kernel that handles one block_size x block_size tile
//   block_size             tile edge length; the loops step by this amount
static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          variance_fn_t var_fn, int block_size) {
  unsigned int total_sse = 0;
  int total_sum = 0;
  int row, col;

  for (row = 0; row < h; row += block_size) {
    const unsigned char *const src_row = src + src_stride * row;
    const unsigned char *const ref_row = ref + ref_stride * row;
    for (col = 0; col < w; col += block_size) {
      unsigned int tile_sse;
      int tile_sum;
      var_fn(src_row + col, src_stride, ref_row + col, ref_stride,
             &tile_sse, &tile_sum);
      total_sse += tile_sse;
      total_sum += tile_sum;
    }
  }

  *sse = total_sse;
  *sum = total_sum;
}

83
84
85
86
// 4x4 block: stores the sum of squared differences in *sse and returns
// *sse - sum * sum / 16 (16 pixels, hence >> 4), where sum is the sum of
// differences reported by vp9_get4x4var_sse2().
unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vp9_get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 4);
}

91
92
unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
Yunqing Wang's avatar
Yunqing Wang committed
93
                                  unsigned int *sse) {
94
95
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
96
                sse, &sum, vp9_get4x4var_sse2, 4);
97
  return *sse - (((unsigned int)sum * sum) >> 5);
Yunqing Wang's avatar
Yunqing Wang committed
98
99
}

100
101
unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
Yunqing Wang's avatar
Yunqing Wang committed
102
                                  unsigned int *sse) {
103
104
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
105
                sse, &sum, vp9_get4x4var_sse2, 4);
106
  return *sse - (((unsigned int)sum * sum) >> 5);
John Koleszar's avatar
John Koleszar committed
107
108
}

109
110
111
112
113
114
115
// 8x8 block, computed with a single vp9_get8x8var_sse2() tile.  Stores the sum
// of squared differences in *sse and returns *sse - sum * sum / 64 (64 pixels,
// hence >> 6).
unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 6);
}

118
119
120
121
122
123
124
// 16 wide by 8 high block, accumulated from 8x8 tiles.  Stores the sum of
// squared differences in *sse and returns *sse - sum * sum / 128 (128 pixels,
// hence >> 7).
unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}
126

127
128
129
130
131
132
133
// 8 wide by 16 high block, accumulated from 8x8 tiles.  Stores the sum of
// squared differences in *sse and returns *sse - sum * sum / 128 (128 pixels,
// hence >> 7).
unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

136
137
138
139
140
141
142
// 16x16 block, computed with a single vp9_get16x16var_sse2() tile.  Stores the
// sum of squared differences in *sse and returns *sse - sum * sum / 256
// (256 pixels, hence >> 8).  With 8-bit pixels |sum| is at most 256 * 255, so
// the square still fits in 32 unsigned bits.
unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((unsigned int)sum * sum) >> 8);
}

145
146
147
148
149
150
// 16x16 block: stores the sum of squared differences reported by
// vp9_get16x16var_sse2() in *sse and returns that same value.  The sum of
// differences is only collected because the kernel requires an output slot.
unsigned int vp9_mse16x16_sse2(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int *sse) {
  int sum;
  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse;
}

153
154
unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
Yunqing Wang's avatar
Yunqing Wang committed
155
                                    unsigned int *sse) {
156
157
158
159
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 10);
Yunqing Wang's avatar
Yunqing Wang committed
160
161
}

162
163
unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
Yunqing Wang's avatar
Yunqing Wang committed
164
                                    unsigned int *sse) {
165
166
167
168
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
Yunqing Wang's avatar
Yunqing Wang committed
169
170
}

171
172
unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
Yunqing Wang's avatar
Yunqing Wang committed
173
                                    unsigned int *sse) {
174
175
176
177
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
Yunqing Wang's avatar
Yunqing Wang committed
178
179
}

180
181
unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
Yunqing Wang's avatar
Yunqing Wang committed
182
                                    unsigned int *sse) {
183
184
185
186
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 12);
Yunqing Wang's avatar
Yunqing Wang committed
187
188
}

189
190
unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
Yunqing Wang's avatar
Yunqing Wang committed
191
                                    unsigned int *sse) {
192
193
194
195
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
Yunqing Wang's avatar
Yunqing Wang committed
196
197
}

198
199
unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
Yunqing Wang's avatar
Yunqing Wang committed
200
                                    unsigned int *sse) {
201
202
203
204
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
John Koleszar's avatar
John Koleszar committed
205
206
}

207
208
209
210
211
212
213
// Prototypes for the fixed-width, variable-height sub-pixel kernels
// vp9_sub_pixel_variance<w>xh_<opt>(), which are defined outside this file.
// The wrappers generated further down accumulate the int return value as the
// sum of differences and read the sum of squared differences from *sse.
// DECLS(opt1, opt2) declares the 8- and 16-wide kernels for opt1 and the
// 4-wide kernel for opt2 (the sse2 set pairs with the 4-wide "sse" kernel).
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                        ptrdiff_t src_stride, \
                                        int x_offset, int y_offset, \
                                        const uint8_t *dst, \
                                        ptrdiff_t dst_stride, \
                                        int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260

// Defines vp9_sub_pixel_variance<w>x<h>_<opt>() on top of the wf-wide kernel
// vp9_sub_pixel_variance<wf>xh_<opt>().  The kernel runs on the first wf
// columns; when w > wf it also runs on the strip starting at column wf, and
// when w > 2 * wf on the strips starting at columns 2 * wf and 3 * wf (so the
// unrolled code covers w == wf, 2 * wf and 4 * wf).  The kernel return values
// are summed into se and the per-strip SSE values into sse.  The total SSE is
// stored in *sse_ptr and the function returns sse - se * se / (w * h), with
// the product evaluated in type `cast` and the division done as a right shift
// by wlog2 + hlog2.
// Strip offsets are written in terms of wf rather than as literal 16/32/48 so
// the macro stays correct for any kernel width.
// NOTE: no line comments inside the macro body; the continuation backslashes
// would swallow the following lines.
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                     int src_stride, \
                                                     int x_offset, \
                                                     int y_offset, \
                                                     const uint8_t *dst, \
                                                     int dst_stride, \
                                                     unsigned int *sse_ptr) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                y_offset, dst, dst_stride, \
                                                h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + (wf), src_stride, \
                                                   x_offset, y_offset, \
                                                   dst + (wf), dst_stride, \
                                                   h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 2 * (wf), src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 2 * (wf), dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 3 * (wf), src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 3 * (wf), dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

263
264
265
266
267
268
269
270
// Instantiates FN for every supported block size.  Arguments per row are:
// width, height, kernel width, log2(width), log2(height), instruction-set
// suffix, and the type used for the se * se product.  Sizes of 512 pixels and
// more use int64_t for that product; smaller ones use unsigned int.  The
// 4-wide sizes use the opt2 kernel suffix, all others use opt1.
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
FN(4,   4,  4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN
283

284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
// Prototypes for the fixed-width, variable-height kernels
// vp9_sub_pixel_avg_variance<width>xh_<isa>(), which are defined outside this
// file.  Compared with the plain sub-pixel kernels they take an additional
// `sec` buffer and its stride.  DECLS declares the 8- and 16-wide kernels for
// isa_wide and the 4-wide kernel for isa_narrow.
#define DECL(width, isa) \
int vp9_sub_pixel_avg_variance##width##xh_##isa(const uint8_t *src, \
                                                ptrdiff_t src_stride, \
                                                int x_offset, int y_offset, \
                                                const uint8_t *dst, \
                                                ptrdiff_t dst_stride, \
                                                const uint8_t *sec, \
                                                ptrdiff_t sec_stride, \
                                                int height, unsigned int *sse)
#define DECLS(isa_wide, isa_narrow) \
DECL(4, isa_narrow); \
DECL(8, isa_wide); \
DECL(16, isa_wide)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL

// Defines vp9_sub_pixel_avg_variance<w>x<h>_<opt>() on top of the wf-wide
// kernel vp9_sub_pixel_avg_variance<wf>xh_<opt>().  The kernel runs on the
// first wf columns; when w > wf it also runs on the strip starting at column
// wf, and when w > 2 * wf on the strips starting at columns 2 * wf and 3 * wf.
// `sec` is advanced by the same column offset as src and dst, and w is passed
// as its row stride.  The kernel return values are summed into se and the
// per-strip SSE values into sse.  The total SSE is stored in *sseptr and the
// function returns sse - se * se / (w * h), with the product evaluated in
// type `cast` and the division done as a right shift by wlog2 + hlog2.
// Strip offsets are written in terms of wf rather than as literal 16/32/48 so
// the macro stays correct for any kernel width.
// NOTE: no line comments inside the macro body; the continuation backslashes
// would swallow the following lines.
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                         int src_stride, \
                                                         int x_offset, \
                                                         int y_offset, \
                                                         const uint8_t *dst, \
                                                         int dst_stride, \
                                                         unsigned int *sseptr, \
                                                         const uint8_t *sec) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                    y_offset, dst, dst_stride, \
                                                    sec, w, h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + (wf), src_stride, \
                                                       x_offset, y_offset, \
                                                       dst + (wf), dst_stride, \
                                                       sec + (wf), w, h, \
                                                       &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 2 * (wf), \
                                                     src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 2 * (wf), \
                                                     dst_stride, \
                                                     sec + 2 * (wf), w, h, \
                                                     &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 3 * (wf), \
                                                     src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 3 * (wf), \
                                                     dst_stride, \
                                                     sec + 3 * (wf), w, h, \
                                                     &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sseptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

// Instantiates the avg-variance FN for every supported block size.  Arguments
// per row are: width, height, kernel width, log2(width), log2(height),
// instruction-set suffix, and the type used for the se * se product.  Sizes of
// 512 pixels and more use int64_t for that product; smaller ones use unsigned
// int.  The 4-wide sizes use the opt2 kernel suffix, all others use opt1.
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
FN(4,   4,  4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN