Commit c3bcf3be authored by Urvang Joshi, committed by Fred BARBIER
Browse files

Intra prediction: Remove unused variants.

The directional predictors for the 45-, 63- and 207-degree angles each had
two or three variants, only one of which was actually being used. This change
removes the C, sse2, ssse3 and neon versions of the unused ones.

Updates to the test:
- test_intra_pred_speed was testing the unused versions, so changed
  it to use the version actually used by code. This meant updating
  some golden MD5 values.
- test_intra_pred_speed was NOT filling up bottom-left and top-right
  pixels randomly, so the predictors using these pixels weren't tested
  properly. This was fixed.

BUG=aomedia:442

Change-Id: I09725d593408b81e0cd636e70a88c28eea5f2222
parent a78287b5
......@@ -46,7 +46,7 @@ if (aom_config("CONFIG_TX64X64") eq "yes") {
push @tx_dims, '64';
}
@pred_names = qw/dc dc_top dc_left dc_128 v h he ve d207 d207e d63 d63e d63f d45 d45e d117 d135 d153/;
@pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153/;
if (aom_config("CONFIG_ALT_INTRA") eq "yes") {
push @pred_names, qw/paeth smooth/;
} else {
......@@ -70,9 +70,7 @@ foreach $dim (@tx_dims) {
}
}
specialize qw/aom_d207_predictor_4x4 sse2/;
specialize qw/aom_d45_predictor_4x4 neon sse2/;
specialize qw/aom_d63_predictor_4x4 ssse3/;
specialize qw/aom_d63e_predictor_4x4 ssse3/;
specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
specialize qw/aom_d135_predictor_4x4 neon/;
specialize qw/aom_d153_predictor_4x4 ssse3/;
......@@ -84,9 +82,6 @@ specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
specialize qw/aom_d207_predictor_8x8 ssse3/;
specialize qw/aom_d45_predictor_8x8 neon sse2/;
specialize qw/aom_d63_predictor_8x8 ssse3/;
specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
specialize qw/aom_d153_predictor_8x8 ssse3/;
specialize qw/aom_v_predictor_8x8 neon msa sse2/;
......@@ -97,9 +92,6 @@ specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
specialize qw/aom_d207_predictor_16x16 ssse3/;
specialize qw/aom_d45_predictor_16x16 neon ssse3/;
specialize qw/aom_d63_predictor_16x16 ssse3/;
specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
specialize qw/aom_d153_predictor_16x16 ssse3/;
specialize qw/aom_v_predictor_16x16 neon msa sse2/;
......@@ -110,9 +102,6 @@ specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
specialize qw/aom_d207_predictor_32x32 ssse3/;
specialize qw/aom_d45_predictor_32x32 ssse3/;
specialize qw/aom_d63_predictor_32x32 ssse3/;
specialize qw/aom_h_predictor_32x32 neon msa sse2/;
specialize qw/aom_d153_predictor_32x32 ssse3/;
specialize qw/aom_v_predictor_32x32 neon msa sse2/;
......
......@@ -314,68 +314,6 @@ void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
// -----------------------------------------------------------------------------
// 4x4 d45 predictor (NEON).
// Applies a rounded (1, 2, 1) filter to the eight pixels at above[0..7] and
// writes four rows, each one starting one filtered pixel further along than
// the previous row. `left` is not read.
void aom_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // The top row is also viewed as one 64-bit lane so that shifting by a
  // multiple of 8 bits slides whole pixels and brings zeros in at the end.
  const uint8x8_t top = vld1_u8(above);
  const uint64x1_t top_bits = vreinterpret_u64_u8(top);
  const uint8x8_t top_plus1 = vreinterpret_u8_u64(vshr_n_u64(top_bits, 8));
  const uint8x8_t top_plus2 = vreinterpret_u8_u64(vshr_n_u64(top_bits, 16));
  // Halving add of the two outer taps, then a rounding halving add with the
  // centre tap.
  const uint8x8_t outer = vhadd_u8(top, top_plus2);
  const uint8x8_t filtered = vrhadd_u8(outer, top_plus1);
  const uint64x1_t filtered_bits = vreinterpret_u64_u8(filtered);
  // Row k is the filtered vector slid down by k pixels; only the low 32 bits
  // (four pixels) of each are stored.
  const uint32x2_t row0 = vreinterpret_u32_u8(filtered);
  const uint32x2_t row1 = vreinterpret_u32_u64(vshr_n_u64(filtered_bits, 8));
  const uint32x2_t row2 = vreinterpret_u32_u64(vshr_n_u64(filtered_bits, 16));
  const uint32x2_t row3 = vreinterpret_u32_u64(vshr_n_u64(filtered_bits, 24));
  uint8_t *out = dst;
  (void)left;

  vst1_lane_u32((uint32_t *)out, row0, 0);
  out += stride;
  vst1_lane_u32((uint32_t *)out, row1, 0);
  out += stride;
  vst1_lane_u32((uint32_t *)out, row2, 0);
  out += stride;
  vst1_lane_u32((uint32_t *)out, row3, 0);
  // The last pixel of the last row was filtered against the shifted-in zeros,
  // so it is replaced by the raw above[7].
  out[3] = above[7];
}
// 8x8 d45 predictor (NEON).
// Filters above[0..7] with a rounded (1, 2, 1) kernel, repeating the last
// pixel past the end of the row, then writes eight rows where each row is the
// previous one slid by one pixel (again repeating its last pixel).
// `left` is not read.
void aom_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // Table-lookup indices that slide a vector by one / two lanes while
  // duplicating lane 7 into the vacated positions.
  static const uint8_t kSlideBy1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t kSlideBy2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
  const uint8x8_t slide1 = vld1_u8(kSlideBy1);
  const uint8x8_t slide2 = vld1_u8(kSlideBy2);
  const uint8x8_t top = vld1_u8(above);
  const uint8x8_t top_plus1 = vtbl1_u8(top, slide1);
  const uint8x8_t top_plus2 = vtbl1_u8(top, slide2);
  const uint8x8_t outer = vhadd_u8(top, top_plus2);
  uint8x8_t row = vrhadd_u8(outer, top_plus1);
  uint8_t *out = dst;
  int remaining;
  (void)left;

  // Seven rows are stored and slid; the eighth is stored after the loop.
  for (remaining = 7; remaining > 0; --remaining) {
    vst1_u8(out, row);
    out += stride;
    row = vtbl1_u8(row, slide1);
  }
  vst1_u8(out, row);
}
// 16x16 d45 predictor (NEON).
// Filters above[0..15] with a rounded (1, 2, 1) kernel, padding with
// above[15] past the end of the row, then writes sixteen rows where each row
// is the previous one slid by one pixel with above[15] entering at the end.
// `left` is not read.
void aom_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  const uint8x16_t top = vld1q_u8(above);
  // above[15] broadcast to every lane; used as the padding source for vextq.
  const uint8x16_t pad = vld1q_dup_u8(above + 15);
  const uint8x16_t top_plus1 = vextq_u8(top, pad, 1);
  const uint8x16_t top_plus2 = vextq_u8(top, pad, 2);
  const uint8x16_t outer = vhaddq_u8(top, top_plus2);
  uint8x16_t row = vrhaddq_u8(outer, top_plus1);
  uint8_t *out = dst;
  int remaining;
  (void)left;

  // Fifteen rows are stored and slid; the sixteenth is stored after the loop.
  for (remaining = 15; remaining > 0; --remaining) {
    vst1q_u8(out, row);
    out += stride;
    row = vextq_u8(row, pad, 1);
  }
  vst1q_u8(out, row);
}
// -----------------------------------------------------------------------------
void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
......
......@@ -23,30 +23,6 @@
// Rounded 3-tap average with weights (1, 2, 1): (a + 2*b + c + 2) >> 2.
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
// Rounded 2-tap average: (a + b + 1) >> 1.
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
// Generic bs x bs d207 predictor built only from the `left` column.
//  - column 0: AVG2 of vertically adjacent left pixels, last row = left[bs-1]
//  - column 1: AVG3 of three consecutive left pixels (left[bs-1] repeated at
//    the bottom), last row = left[bs-1]
//  - remaining columns: the bottom row is left[bs-1]; every other row copies
//    the row below it, taken two columns to the left.
// `above` is not read.
static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
  uint8_t *const col0 = dst;
  uint8_t *const col1 = dst + 1;
  uint8_t *const rest = dst + 2;  // columns 2 .. bs-1
  const int last = bs - 1;
  int row, col;
  (void)above;

  // Column 0.
  for (row = 0; row < last; ++row) {
    col0[row * stride] = AVG2(left[row], left[row + 1]);
  }
  col0[last * stride] = left[last];

  // Column 1.
  for (row = 0; row < bs - 2; ++row) {
    col1[row * stride] = AVG3(left[row], left[row + 1], left[row + 2]);
  }
  col1[(last - 1) * stride] = AVG3(left[last - 1], left[last], left[last]);
  col1[last * stride] = left[last];

  // Bottom row of the remaining columns.
  for (col = 0; col < bs - 2; ++col) {
    rest[last * stride + col] = left[last];
  }

  // Work upwards: each row is the row below it shifted right by two columns,
  // so the row below must already be complete.
  for (row = last - 1; row >= 0; --row) {
    uint8_t *const cur = rest + row * stride;
    const uint8_t *const below = cur + stride - 2;
    for (col = 0; col < bs - 2; ++col) {
      cur[col] = below[col];
    }
  }
}
static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
......@@ -62,23 +38,6 @@ static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
}
}
// Generic bs x bs d63 predictor built only from the `above` row.
// Row 0 holds AVG2 of adjacent above pixels and row 1 holds AVG3 of three
// consecutive above pixels. Every later even row reuses row 0 and every later
// odd row reuses row 1, each shifted left by (row / 2) pixels and padded on
// the right with above[bs - 1]. `left` is not read.
static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                 const uint8_t *above, const uint8_t *left) {
  int col, shift;
  (void)left;

  for (col = 0; col < bs; ++col) {
    dst[col] = AVG2(above[col], above[col + 1]);
    dst[stride + col] = AVG3(above[col], above[col + 1], above[col + 2]);
  }

  // `shift` is the horizontal offset for the row pair (2*shift, 2*shift + 1).
  for (shift = 1; 2 * shift < bs; ++shift) {
    const int keep = bs - 1 - shift;  // pixels copied from rows 0 / 1
    const int pad = bs - keep;        // pixels filled with above[bs - 1]
    uint8_t *const even_row = dst + (2 * shift) * stride;
    uint8_t *const odd_row = dst + (2 * shift + 1) * stride;

    memcpy(even_row, dst + shift, keep);
    memset(even_row + keep, above[bs - 1], pad);
    memcpy(odd_row, dst + stride + shift, keep);
    memset(odd_row + keep, above[bs - 1], pad);
  }
}
static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
......@@ -93,25 +52,6 @@ static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
}
}
// Generic bs x bs d45 predictor built only from the `above` row.
// Row 0 is the AVG3-filtered above row with its final pixel set to
// above[bs - 1]. Row r (r >= 1) is row 0 shifted left by r pixels, with the
// vacated right-hand pixels filled with above[bs - 1]. `left` is not read.
static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8_t fill = above[bs - 1];
  int col, row;
  (void)left;

  for (col = 0; col < bs - 1; ++col) {
    dst[col] = AVG3(above[col], above[col + 1], above[col + 2]);
  }
  dst[bs - 1] = fill;

  for (row = 1; row < bs; ++row) {
    uint8_t *const out = dst + row * stride;
    const int keep = bs - 1 - row;  // pixels reused from row 0

    memcpy(out, dst + row, keep);
    memset(out + keep, fill, row + 1);
  }
}
static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
......@@ -411,82 +351,6 @@ static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
}
}
// 2x2 "he" predictor: each row is filled with a single value, the AVG3 of
// three consecutive edge pixels taken from above[-1], left[0], left[1],
// left[2].
void aom_he_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left) {
  const int corner = above[-1];
  const int l0 = left[0];
  const int l1 = left[1];
  const int l2 = left[2];
  const int row0 = AVG3(corner, l0, l1);
  const int row1 = AVG3(l0, l1, l2);

  memset(dst, row0, 2);
  memset(dst + stride, row1, 2);
}
// 2x2 "ve" predictor: the first row is the AVG3-filtered above row (using
// above[-1] .. above[2]); the second row is a copy of the first.
// `left` is not read.
void aom_ve_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left) {
  // Snapshot of the edge pixels, taken before anything is written to dst.
  const int top[4] = { above[-1], above[0], above[1], above[2] };
  int col;
  (void)left;

  for (col = 0; col < 2; ++col) {
    dst[col] = AVG3(top[col], top[col + 1], top[col + 2]);
  }
  memcpy(dst + stride, dst, 2);
}
// 2x2 d207 predictor built from left[0..3]; `above` is not read.
// DST(i, j) is a macro defined elsewhere in this file (not visible here);
// presumably it addresses column i of row j of dst -- confirm.
void aom_d207_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int l0 = left[0];
  const int l1 = left[1];
  const int l2 = left[2];
  const int l3 = left[3];
  (void)above;

  // Second DST index 0.
  DST(0, 0) = AVG2(l0, l1);
  DST(1, 0) = AVG3(l0, l1, l2);
  // Second DST index 1.
  DST(0, 1) = AVG2(l1, l2);
  DST(1, 1) = AVG3(l1, l2, l3);
}
// 2x2 d63 predictor built from above[0..3]; `left` is not read.
// Second DST index 0 receives AVG2 values, index 1 receives AVG3 values.
// DST(i, j) is a macro defined elsewhere in this file (not visible here);
// presumably it addresses column i of row j of dst -- confirm.
void aom_d63_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const int a0 = above[0];
  const int a1 = above[1];
  const int a2 = above[2];
  const int a3 = above[3];
  (void)left;

  DST(0, 0) = AVG2(a0, a1);
  DST(0, 1) = AVG3(a0, a1, a2);
  DST(1, 0) = AVG2(a1, a2);
  DST(1, 1) = AVG3(a1, a2, a3);
}
// 2x2 d63f predictor built from above[0..3]; `left` is not read.
// Second DST index 0 receives AVG2 values, index 1 receives AVG3 values.
// DST(i, j) is a macro defined elsewhere in this file (not visible here);
// presumably it addresses column i of row j of dst -- confirm.
void aom_d63f_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int a0 = above[0];
  const int a1 = above[1];
  const int a2 = above[2];
  const int a3 = above[3];
  (void)left;

  DST(0, 0) = AVG2(a0, a1);
  DST(0, 1) = AVG3(a0, a1, a2);
  DST(1, 0) = AVG2(a1, a2);
  DST(1, 1) = AVG3(a1, a2, a3);
}
// 2x2 d45 predictor built from above[0..3]; `left` is not read.
// Positions whose two DST indices have the same sum share one value, and
// above[3] is repeated as the final tap of the last value.
// DST(i, j) is a macro defined elsewhere in this file (not visible here);
// presumably it addresses column i of row j of dst -- confirm.
void aom_d45_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const int a0 = above[0];
  const int a1 = above[1];
  const int a2 = above[2];
  const int a3 = above[3];
  const int diag0 = AVG3(a0, a1, a2);
  const int diag1 = AVG3(a1, a2, a3);
  const int diag2 = AVG3(a2, a3, a3);
  (void)stride;
  (void)left;

  DST(0, 0) = diag0;
  DST(1, 0) = diag1;
  DST(0, 1) = diag1;
  DST(1, 1) = diag2;
}
void aom_d45e_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
const int A = above[0];
......@@ -539,123 +403,6 @@ void aom_d153_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride,
DST(1, 1) = AVG3(J, I, X);
}
// 4x4 "he" predictor: row r is filled with a single value, the AVG3 of three
// consecutive entries of the edge sequence
//   above[-1], left[0], left[1], left[2], left[3], left[3]
// starting at entry r (left[3] is repeated for the last row).
void aom_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left) {
  // Snapshot of the edge pixels, taken before anything is written to dst.
  const int edge[6] = { above[-1], left[0], left[1],
                        left[2],   left[3], left[3] };
  int row;

  for (row = 0; row < 4; ++row) {
    memset(dst + stride * row, AVG3(edge[row], edge[row + 1], edge[row + 2]),
           4);
  }
}
// 4x4 "ve" predictor: the first row is the AVG3-filtered above row (using
// above[-1] .. above[4]); rows 1-3 are copies of the first row.
// `left` is not read.
void aom_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left) {
  // Snapshot of the edge pixels, taken before anything is written to dst.
  const int top[6] = { above[-1], above[0], above[1],
                       above[2],  above[3], above[4] };
  int i;
  (void)left;

  for (i = 0; i < 4; ++i) {
    dst[i] = AVG3(top[i], top[i + 1], top[i + 2]);
  }
  for (i = 1; i < 4; ++i) {
    memcpy(dst + stride * i, dst, 4);
  }
}
// 4x4 d207 predictor built from left[0..3]; `above` is not read.
// Six filtered values are computed once and then placed; positions past the
// filtered values receive the raw left[3].
// DST(i, j) is a macro defined elsewhere in this file (not visible here);
// presumably it addresses column i of row j of dst -- confirm.
void aom_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int l0 = left[0];
  const int l1 = left[1];
  const int l2 = left[2];
  const int l3 = left[3];
  const int two01 = AVG2(l0, l1);
  const int two12 = AVG2(l1, l2);
  const int two23 = AVG2(l2, l3);
  const int three012 = AVG3(l0, l1, l2);
  const int three123 = AVG3(l1, l2, l3);
  const int three233 = AVG3(l2, l3, l3);
  (void)above;

  // Second DST index 0.
  DST(0, 0) = two01;
  DST(1, 0) = three012;
  DST(2, 0) = two12;
  DST(3, 0) = three123;
  // Second DST index 1.
  DST(0, 1) = two12;
  DST(1, 1) = three123;
  DST(2, 1) = two23;
  DST(3, 1) = three233;
  // Second DST index 2.
  DST(0, 2) = two23;
  DST(1, 2) = three233;
  DST(2, 2) = l3;
  DST(3, 2) = l3;
  // Second DST index 3.
  DST(0, 3) = l3;
  DST(1, 3) = l3;
  DST(2, 3) = l3;
  DST(3, 3) = l3;
}
// 4x4 d63 predictor built from above[0..6]; `left` is not read.
// Second DST indices 0 and 2 take AVG2 values, indices 1 and 3 take AVG3
// values; indices 2 and 3 start one above-pixel further along than 0 and 1.
// DST(i, j) is a macro defined elsewhere in this file (not visible here);
// presumably it addresses column i of row j of dst -- confirm.
void aom_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const int a0 = above[0];
  const int a1 = above[1];
  const int a2 = above[2];
  const int a3 = above[3];
  const int a4 = above[4];
  const int a5 = above[5];
  const int a6 = above[6];
  const int two01 = AVG2(a0, a1);
  const int two12 = AVG2(a1, a2);
  const int two23 = AVG2(a2, a3);
  const int two34 = AVG2(a3, a4);
  const int two45 = AVG2(a4, a5);  // differs from vp8
  const int three012 = AVG3(a0, a1, a2);
  const int three123 = AVG3(a1, a2, a3);
  const int three234 = AVG3(a2, a3, a4);
  const int three345 = AVG3(a3, a4, a5);
  const int three456 = AVG3(a4, a5, a6);  // differs from vp8
  (void)left;

  // Second DST index 0.
  DST(0, 0) = two01;
  DST(1, 0) = two12;
  DST(2, 0) = two23;
  DST(3, 0) = two34;
  // Second DST index 1.
  DST(0, 1) = three012;
  DST(1, 1) = three123;
  DST(2, 1) = three234;
  DST(3, 1) = three345;
  // Second DST index 2.
  DST(0, 2) = two12;
  DST(1, 2) = two23;
  DST(2, 2) = two34;
  DST(3, 2) = two45;
  // Second DST index 3.
  DST(0, 3) = three123;
  DST(1, 3) = three234;
  DST(2, 3) = three345;
  DST(3, 3) = three456;
}
// 4x4 d63f predictor built from above[0..7]; `left` is not read.
// Same layout as the d63 4x4 predictor except for the two DST(3, *) entries at
// second index 2 and 3, which use AVG3(above[4..6]) and AVG3(above[5..7]).
// DST(i, j) is a macro defined elsewhere in this file (not visible here);
// presumably it addresses column i of row j of dst -- confirm.
void aom_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int a0 = above[0];
  const int a1 = above[1];
  const int a2 = above[2];
  const int a3 = above[3];
  const int a4 = above[4];
  const int a5 = above[5];
  const int a6 = above[6];
  const int a7 = above[7];
  const int two01 = AVG2(a0, a1);
  const int two12 = AVG2(a1, a2);
  const int two23 = AVG2(a2, a3);
  const int two34 = AVG2(a3, a4);
  const int three012 = AVG3(a0, a1, a2);
  const int three123 = AVG3(a1, a2, a3);
  const int three234 = AVG3(a2, a3, a4);
  const int three345 = AVG3(a3, a4, a5);
  const int three456 = AVG3(a4, a5, a6);
  const int three567 = AVG3(a5, a6, a7);
  (void)left;

  // Second DST index 0.
  DST(0, 0) = two01;
  DST(1, 0) = two12;
  DST(2, 0) = two23;
  DST(3, 0) = two34;
  // Second DST index 1.
  DST(0, 1) = three012;
  DST(1, 1) = three123;
  DST(2, 1) = three234;
  DST(3, 1) = three345;
  // Second DST index 2.
  DST(0, 2) = two12;
  DST(1, 2) = two23;
  DST(2, 2) = two34;
  DST(3, 2) = three456;
  // Second DST index 3.
  DST(0, 3) = three123;
  DST(1, 3) = three234;
  DST(2, 3) = three345;
  DST(3, 3) = three567;
}
// 4x4 d45 predictor built from above[0..7]; `left` is not read.
// Positions whose two DST indices have the same sum s share one value:
// AVG3(above[s], above[s + 1], above[s + 2]) for s = 0..5, and the raw
// above[7] for s = 6.
// DST(i, j) is a macro defined elsewhere in this file (not visible here);
// presumably it addresses column i of row j of dst -- confirm.
void aom_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const int a0 = above[0];
  const int a1 = above[1];
  const int a2 = above[2];
  const int a3 = above[3];
  const int a4 = above[4];
  const int a5 = above[5];
  const int a6 = above[6];
  const int a7 = above[7];
  const int diag0 = AVG3(a0, a1, a2);
  const int diag1 = AVG3(a1, a2, a3);
  const int diag2 = AVG3(a2, a3, a4);
  const int diag3 = AVG3(a3, a4, a5);
  const int diag4 = AVG3(a4, a5, a6);
  const int diag5 = AVG3(a5, a6, a7);
  const int diag6 = a7;  // differs from vp8
  (void)stride;
  (void)left;

  // Second DST index 0.
  DST(0, 0) = diag0;
  DST(1, 0) = diag1;
  DST(2, 0) = diag2;
  DST(3, 0) = diag3;
  // Second DST index 1.
  DST(0, 1) = diag1;
  DST(1, 1) = diag2;
  DST(2, 1) = diag3;
  DST(3, 1) = diag4;
  // Second DST index 2.
  DST(0, 2) = diag2;
  DST(1, 2) = diag3;
  DST(2, 2) = diag4;
  DST(3, 2) = diag5;
  // Second DST index 3.
  DST(0, 3) = diag3;
  DST(1, 3) = diag4;
  DST(2, 3) = diag5;
  DST(3, 3) = diag6;
}
void aom_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
const int A = above[0];
......@@ -746,37 +493,6 @@ void aom_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
}
#if CONFIG_HIGHBITDEPTH
// High-bitdepth (uint16_t) bs x bs d207 predictor built only from `left`.
//  - column 0: AVG2 of vertically adjacent left pixels, last row = left[bs-1]
//  - column 1: AVG3 of three consecutive left pixels (left[bs-1] repeated at
//    the bottom), last row = left[bs-1]
//  - remaining columns: the bottom row is left[bs-1]; every other row copies
//    the row below it, taken two columns to the left.
// `above` and `bd` are not read.
static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
                                         int bs, const uint16_t *above,
                                         const uint16_t *left, int bd) {
  uint16_t *const col0 = dst;
  uint16_t *const col1 = dst + 1;
  uint16_t *const rest = dst + 2;  // columns 2 .. bs-1
  const int last = bs - 1;
  int row, col;
  (void)above;
  (void)bd;

  // Column 0.
  for (row = 0; row < last; ++row) {
    col0[row * stride] = AVG2(left[row], left[row + 1]);
  }
  col0[last * stride] = left[last];

  // Column 1.
  for (row = 0; row < bs - 2; ++row) {
    col1[row * stride] = AVG3(left[row], left[row + 1], left[row + 2]);
  }
  col1[(last - 1) * stride] = AVG3(left[last - 1], left[last], left[last]);
  col1[last * stride] = left[last];

  // Bottom row of the remaining columns.
  for (col = 0; col < bs - 2; ++col) {
    rest[last * stride + col] = left[last];
  }

  // Work upwards: each row is the row below it shifted right by two columns,
  // so the row below must already be complete.
  for (row = last - 1; row >= 0; --row) {
    uint16_t *const cur = rest + row * stride;
    const uint16_t *const below = cur + stride - 2;
    for (col = 0; col < bs - 2; ++col) {
      cur[col] = below[col];
    }
  }
}
static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
......@@ -794,9 +510,9 @@ static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
}
}
static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
const uint16_t *above,
const uint16_t *left, int bd) {
static INLINE void highbd_d63e_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
(void)left;
(void)bd;
......@@ -810,24 +526,6 @@ static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
}
}
// Alias: any use of highbd_d63e_predictor expands to highbd_d63_predictor.
#define highbd_d63e_predictor highbd_d63_predictor
// High-bitdepth (uint16_t) bs x bs d45 predictor built only from `above`.
// The pixel at (row, col) is AVG3 of the three above pixels starting at index
// row + col, as long as that window ends before index 2 * bs; otherwise it is
// the raw above[2 * bs - 1]. `left` and `bd` are not read.
static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const int limit = bs * 2;
  int row, col;
  (void)left;
  (void)bd;

  for (row = 0; row < bs; ++row, dst += stride) {
    for (col = 0; col < bs; ++col) {
      const int first = row + col;
      if (first + 2 < limit) {
        dst[col] = AVG3(above[first], above[first + 1], above[first + 2]);
      } else {
        dst[col] = above[limit - 1];
      }
    }
  }
}
static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
......@@ -1251,9 +949,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
#endif // CONFIG_TX64X64
#endif // CONFIG_HIGHBITDEPTH
// Instantiations of the intra_pred_above_4x4 / intra_pred_allsizes macros for
// the directional predictor families. The macros are defined elsewhere in
// this file (not visible here); presumably each expands to the public
// aom_<name>_predictor_NxN_c entry points that forward to the static
// <name>_predictor helpers -- confirm against the macro definitions.
intra_pred_above_4x4(d207)
intra_pred_above_4x4(d63)
intra_pred_above_4x4(d45)
intra_pred_allsizes(d207e)
intra_pred_allsizes(d63e)
intra_pred_above_4x4(d45e)
......
......@@ -44,98 +44,6 @@ SECTION .text
pavgb %4, %2
%endmacro
;------------------------------------------------------------------------------
; d45_predictor_4x4 (SSE2, x86inc syntax)
; Named arguments: dst, stride, above. goffset names the extra register that
; is handed to GET_GOT. cglobal declares 3 arguments, 4 GPRs, 4 XMM registers.
; Writes four rows of 4 bytes each: row r holds the filtered `above` row
; starting at byte r; the last byte of the last row is then overwritten with
; the raw above[7].
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
GET_GOT goffsetq
movq m0, [aboveq] ; m0 = above[0..7]
; The above pointer is not read again, so the third register is renamed temp.
DEFINE_ARGS dst, stride, temp
psrldq m1, m0, 1 ; m1 = above row shifted down by one byte
psrldq m2, m0, 2 ; m2 = above row shifted down by two bytes
; m3 = filtered row. Going by the macro's name this is (x + 2*y + z + 2) >> 2
; per byte with x = m0, y = m1, z = m2; the macro body is only partly visible
; here -- confirm.
X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
; store 4 lines
movd [dstq ], m3
psrlq m3, 8 ; advance one byte for the next row
movd [dstq+strideq ], m3
lea dstq, [dstq+strideq*2] ; dstq now points at row 2
psrlq m3, 8
movd [dstq ], m3
psrlq m3, 8
movd [dstq+strideq ], m3
; NOTE(review): the next lines rely on m0 still holding the unfiltered above
; row after the macro -- confirm the macro does not clobber its first argument.
psrlq m0, 56 ; low byte of m0 = above[7]
movd tempd, m0
mov [dstq+strideq+3], tempb ; row 3, byte 3 = above[7]
RESTORE_GOT
RET
;------------------------------------------------------------------------------
; d45_predictor_8x8 (SSE2, x86inc syntax)
; Named arguments: dst, stride, above. goffset names the extra register that
; is handed to GET_GOT. cglobal declares 3 arguments, 4 GPRs, 4 XMM registers.
; Builds one 16-byte vector (filtered above row followed by eight copies of
; above[7]) and writes eight 8-byte rows, shifting the vector down by one byte
; before each store.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
GET_GOT goffsetq
movu m1, [aboveq] ; unaligned 16-byte load starting at above[0]
pslldq m0, m1, 1 ; m0 = row shifted up one byte (byte 0 becomes zero)
psrldq m2, m1, 1 ; m2 = row shifted down one byte
; The above pointer is not read again, so the third register is renamed.
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
; m3 = filtered row. Going by the macro's name this is (x + 2*y + z + 2) >> 2
; per byte with x = m0, y = m1, z = m2; the macro body is only partly visible
; here -- confirm.
X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
; NOTE(review): the broadcast below relies on m0 still holding the shifted
; above row after the macro -- confirm the macro does not clobber it.
punpckhbw m0, m0 ; 7 7
punpcklwd m0, m0 ; 7 7 7 7
punpckldq m0, m0 ; 7 7 7 7 7 7 7 7
punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7
; store 4 lines
psrldq m3, 1 ; drop the leading byte before each row is stored
movq [dstq ], m3
psrldq m3, 1
movq [dstq+strideq ], m3
psrldq m3, 1
movq [dstq+strideq*2], m3
psrldq m3, 1
movq [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4] ; dstq now points at row 4
; store next 4 lines
psrldq m3, 1
movq [dstq ], m3
psrldq m3, 1
movq [dstq+strideq ], m3
psrldq m3, 1
movq [dstq+strideq*2], m3
psrldq m3, 1
movq [dstq+stride3q ], m3
RESTORE_GOT
RET
;------------------------------------------------------------------------------
; d207_predictor_4x4 (SSE2, x86inc syntax)
; Named arguments: dst, stride, unused, left. The third argument is never
; read. goffset names the extra register that is handed to GET_GOT.
; Reads the four bytes a, b, c, d at left[0..3], interleaves their 2-tap
; (pavgb) and 3-tap averages, and writes four 4-byte rows; each row starts two
; bytes further into the interleaved vector, and the last row is d d d d.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
GET_GOT goffsetq
movd m0, [leftq] ; abcd [byte]
punpcklbw m4, m0, m0 ; aabb ccdd
punpcklwd m4, m4 ; aaaa bbbb cccc dddd
psrldq m4, 12 ; dddd
punpckldq m0, m4 ; abcd dddd
psrldq m1, m0, 1 ; bcdd
psrldq m2, m0, 2 ; cddd
; NOTE(review): the pavgb below relies on m0 still holding "abcd dddd" after
; the macro -- confirm the macro does not clobber its first argument.
X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d
pavgb m1, m0 ; ab, bc, cd, d [byte]
punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
movd [dstq ], m1
psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
movd [dstq+strideq], m1
lea dstq, [dstq+strideq*2] ; dstq now points at row 2
psrlq m1, 16 ; cd, c3d, d, d
movd [dstq ], m1
movd [dstq+strideq], m4 ; d, d, d, d
RESTORE_GOT
RET
INIT_XMM sse2
cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
GET_GOT goffsetq
......