/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <smmintrin.h> /* SSE4.1 */

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"

#include "aom_dsp/variance.h"
#include "aom_dsp/aom_filter.h"

/*
 * Accumulates the sum of differences and the sum of squared differences
 * between two 4x4 blocks of 16-bit samples.
 *
 * a8, b8:             block pointers; passed through CONVERT_TO_SHORTPTR()
 *                     and then read as uint16_t samples.
 * a_stride, b_stride: row pitch, applied in uint16_t units.
 * sse:                receives the sum of the 16 squared differences.
 * sum:                receives the signed sum of the 16 differences (a - b).
 *
 * Differences are formed in 16-bit lanes and both totals are accumulated in
 * 32-bit lanes before being widened on store, so the samples are assumed to
 * be narrow enough for neither step to wrap (presumably at most 12 bits,
 * judging by the 8/10/12 callers in this file -- TODO confirm).
 */
static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
                                         const uint8_t *b8, int b_stride,
                                         uint64_t *sse, int64_t *sum) {
  __m128i u0, u1, u2, u3;
  __m128i s0, s1, s2, s3;
  __m128i t0, t1, x0, y0;
  __m128i a0, a1, a2, a3;
  __m128i b0, b1, b2, b3;
  const __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);

  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);

  // One 64-bit load per row picks up exactly four 16-bit samples.
  a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
  a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
  a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
  a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));

  b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
  b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
  b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
  b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));

  // Pack two rows into each register. The interleaved sample order is
  // irrelevant because every lane is summed below, as long as a and b are
  // packed the same way.
  u0 = _mm_unpacklo_epi16(a0, a1);
  u1 = _mm_unpacklo_epi16(a2, a3);
  u2 = _mm_unpacklo_epi16(b0, b1);
  u3 = _mm_unpacklo_epi16(b2, b3);

  // Per-sample differences (a - b) in 16-bit lanes.
  s0 = _mm_sub_epi16(u0, u2);
  s1 = _mm_sub_epi16(u1, u3);

  // Multiplying by 1 with madd widens adjacent pairs to 32-bit partial sums.
  t0 = _mm_madd_epi16(s0, k_one_epi16);
  t1 = _mm_madd_epi16(s1, k_one_epi16);

  // Three horizontal adds fold the eight partial sums into lane 0.
  s2 = _mm_hadd_epi32(t0, t1);
  s3 = _mm_hadd_epi32(s2, s2);
  y0 = _mm_hadd_epi32(s3, s3);

  // madd of the differences with themselves yields pairwise sums of squares.
  t0 = _mm_madd_epi16(s0, s0);
  t1 = _mm_madd_epi16(s1, s1);

  s2 = _mm_hadd_epi32(t0, t1);
  s3 = _mm_hadd_epi32(s2, s2);
  x0 = _mm_hadd_epi32(s3, s3);

  *sse = (uint64_t)_mm_extract_epi32(x0, 0);
  *sum = (int64_t)_mm_extract_epi32(y0, 0);
}

/*
 * 4x4 variance, "highbd_8" variant.
 *
 * Stores the sum of squared differences between blocks a and b in *sse
 * (narrowed to 32 bits, no rescaling) and returns sse - sum * sum / 16,
 * where 16 is the number of samples in the block. A negative result is
 * clamped to 0.
 */
uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                         const uint8_t *b, int b_stride,
                                         uint32_t *sse) {
  int64_t sum, diff;
  uint64_t local_sse;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
  *sse = (uint32_t)local_sse;

  // >> 4 divides the squared sum by the 16 samples of a 4x4 block.
  diff = (int64_t)*sse - ((sum * sum) >> 4);
  return (diff >= 0) ? (uint32_t)diff : 0;
}

/*
 * 4x4 variance, "highbd_10" variant.
 *
 * The raw sum of squared differences is reduced with
 * ROUND_POWER_OF_TWO(., 4) and the raw sum of differences with
 * ROUND_POWER_OF_TWO(., 2) before use -- presumably to bring 10-bit
 * statistics back to an 8-bit scale (2 extra bits per sample, 4 per squared
 * sample); confirm against the reference C implementation.
 *
 * Stores the reduced SSE in *sse and returns sse - sum * sum / 16, clamped
 * to 0 when negative.
 */
uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  int64_t sum, diff;
  uint64_t local_sse;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
  sum = ROUND_POWER_OF_TWO(sum, 2);

  // >> 4 divides the squared sum by the 16 samples of a 4x4 block.
  diff = (int64_t)*sse - ((sum * sum) >> 4);
  return (diff >= 0) ? (uint32_t)diff : 0;
}

/*
 * 4x4 variance, "highbd_12" variant.
 *
 * The raw sum of squared differences is reduced with
 * ROUND_POWER_OF_TWO(., 8) and the raw sum of differences with
 * ROUND_POWER_OF_TWO(., 4) before use -- presumably to bring 12-bit
 * statistics back to an 8-bit scale (4 extra bits per sample, 8 per squared
 * sample); confirm against the reference C implementation.
 *
 * Stores the reduced SSE in *sse and returns sse - sum * sum / 16, clamped
 * to 0 when negative.
 */
uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  int64_t sum, diff;
  uint64_t local_sse;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
  sum = ROUND_POWER_OF_TWO(sum, 4);

  // >> 4 divides the squared sum by the 16 samples of a 4x4 block.
  diff = (int64_t)*sse - ((sum * sum) >> 4);
  return (diff >= 0) ? (uint32_t)diff : 0;
}

// Sub-pixel
Yaowu Xu's avatar
Yaowu Xu committed
110
uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
clang-format's avatar
clang-format committed
111
112
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse) {
113
114
115
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];

Yaowu Xu's avatar
Yaowu Xu committed
116
  aom_highbd_var_filter_block2d_bil_first_pass(
clang-format's avatar
clang-format committed
117
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
Yaowu Xu's avatar
Yaowu Xu committed
118
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
clang-format's avatar
clang-format committed
119
120
                                                bilinear_filters_2t[yoffset]);

Yaowu Xu's avatar
Yaowu Xu committed
121
  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
clang-format's avatar
clang-format committed
122
                                  sse);
123
124
}

Yaowu Xu's avatar
Yaowu Xu committed
125
uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
clang-format's avatar
clang-format committed
126
127
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse) {
128
129
130
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];

Yaowu Xu's avatar
Yaowu Xu committed
131
  aom_highbd_var_filter_block2d_bil_first_pass(
clang-format's avatar
clang-format committed
132
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
Yaowu Xu's avatar
Yaowu Xu committed
133
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
clang-format's avatar
clang-format committed
134
135
                                                bilinear_filters_2t[yoffset]);

Yaowu Xu's avatar
Yaowu Xu committed
136
  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
clang-format's avatar
clang-format committed
137
                                   dst_stride, sse);
138
139
}

Yaowu Xu's avatar
Yaowu Xu committed
140
uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
clang-format's avatar
clang-format committed
141
142
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse) {
143
144
145
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];

Yaowu Xu's avatar
Yaowu Xu committed
146
  aom_highbd_var_filter_block2d_bil_first_pass(
clang-format's avatar
clang-format committed
147
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
Yaowu Xu's avatar
Yaowu Xu committed
148
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
clang-format's avatar
clang-format committed
149
150
                                                bilinear_filters_2t[yoffset]);

Yaowu Xu's avatar
Yaowu Xu committed
151
  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
clang-format's avatar
clang-format committed
152
                                   dst_stride, sse);
153
154
}

155
156
// Sub-pixel average

Yaowu Xu's avatar
Yaowu Xu committed
157
uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
clang-format's avatar
clang-format committed
158
159
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse,
160
161
162
163
164
    const uint8_t *second_pred) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];
  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);

Yaowu Xu's avatar
Yaowu Xu committed
165
  aom_highbd_var_filter_block2d_bil_first_pass(
clang-format's avatar
clang-format committed
166
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
Yaowu Xu's avatar
Yaowu Xu committed
167
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
clang-format's avatar
clang-format committed
168
                                                bilinear_filters_2t[yoffset]);
169

Yaowu Xu's avatar
Yaowu Xu committed
170
  aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
clang-format's avatar
clang-format committed
171
                           4);
172

Yaowu Xu's avatar
Yaowu Xu committed
173
  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
clang-format's avatar
clang-format committed
174
                                  sse);
175
176
}

Yaowu Xu's avatar
Yaowu Xu committed
177
uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
clang-format's avatar
clang-format committed
178
179
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse,
180
181
182
183
184
    const uint8_t *second_pred) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];
  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);

Yaowu Xu's avatar
Yaowu Xu committed
185
  aom_highbd_var_filter_block2d_bil_first_pass(
clang-format's avatar
clang-format committed
186
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
Yaowu Xu's avatar
Yaowu Xu committed
187
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
clang-format's avatar
clang-format committed
188
                                                bilinear_filters_2t[yoffset]);
189

Yaowu Xu's avatar
Yaowu Xu committed
190
  aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
clang-format's avatar
clang-format committed
191
                           4);
192

Yaowu Xu's avatar
Yaowu Xu committed
193
  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
clang-format's avatar
clang-format committed
194
                                   dst_stride, sse);
195
196
}

Yaowu Xu's avatar
Yaowu Xu committed
197
uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
clang-format's avatar
clang-format committed
198
199
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse,
200
201
202
203
204
    const uint8_t *second_pred) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];
  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);

Yaowu Xu's avatar
Yaowu Xu committed
205
  aom_highbd_var_filter_block2d_bil_first_pass(
clang-format's avatar
clang-format committed
206
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
Yaowu Xu's avatar
Yaowu Xu committed
207
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
clang-format's avatar
clang-format committed
208
                                                bilinear_filters_2t[yoffset]);
209

Yaowu Xu's avatar
Yaowu Xu committed
210
  aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
clang-format's avatar
clang-format committed
211
                           4);
212

Yaowu Xu's avatar
Yaowu Xu committed
213
  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
clang-format's avatar
clang-format committed
214
                                   dst_stride, sse);
215
}