Commit 93b543ab authored by Urvang Joshi

Remove ALT_INTRA flag.

This experiment has been adopted, as it has been cleared by Tapas.

Change-Id: I0682face60f62dd43091efa0a92d09d846396850
parent 9f262c5b
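
For context: CONFIG_ALT_INTRA gated the replacement of the TrueMotion (TM) intra predictor with the Paeth and smooth predictors, so adopting the experiment makes every TM code path below dead code. A minimal scalar sketch of the two per-pixel rules, written to match tm_predictor() and paeth_predictor_single() in aom_dsp/intrapred.c (the helper names here are illustrative only):

#include <stdint.h>

static int clip_u8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }
static int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }

/* TM/TrueMotion (removed): extend the gradient from the top-left corner. */
static uint8_t tm_pixel(uint8_t left, uint8_t top, uint8_t top_left) {
  return (uint8_t)clip_u8(left + top - top_left);
}

/* Paeth (adopted): start from the same gradient value, then pick whichever
 * neighbor (left, top, top_left) lies closest to it. */
static uint8_t paeth_pixel(uint8_t left, uint8_t top, uint8_t top_left) {
  const int base = left + top - top_left;
  const int p_left = abs_diff(base, left);
  const int p_top = abs_diff(base, top);
  const int p_top_left = abs_diff(base, top_left);
  return (p_left <= p_top && p_left <= p_top_left) ? left
         : (p_top <= p_top_left) ? top : top_left;
}
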
@@ -62,14 +62,9 @@ foreach $w (@tx_dims) {
}
}
-@pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153/;
-if (aom_config("CONFIG_ALT_INTRA") eq "yes") {
-  push @pred_names, qw/paeth smooth/;
-  if (aom_config("CONFIG_SMOOTH_HV") eq "yes") {
-    push @pred_names, qw/smooth_v smooth_h/;
-  }
-} else {
-  push @pred_names, 'tm';
-}
+@pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153 paeth smooth/;
+if (aom_config("CONFIG_SMOOTH_HV") eq "yes") {
+  push @pred_names, qw/smooth_v smooth_h/;
+}
#
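
Each entry in @pred_names makes the RTCD script emit an aom_<name>_predictor_<W>x<H> entry point, and the specialize lines below attach SIMD versions to it; dropping 'tm' from the list is what turns the tm specializations and kernels removed below into dead code. The generated C-reference entry points share the signature seen on the NEON kernels later in this diff, for example (the Paeth variant shown is one of the predictors kept by this change):

#include <stddef.h>
#include <stdint.h>

void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
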
@@ -93,9 +88,6 @@ specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
specialize qw/aom_d135_predictor_4x4 neon/;
specialize qw/aom_d153_predictor_4x4 ssse3/;
specialize qw/aom_v_predictor_4x4 neon msa sse2/;
if (aom_config("CONFIG_ALT_INTRA") eq "") {
specialize qw/aom_tm_predictor_4x4 neon dspr2 msa sse2/;
} # CONFIG_ALT_INTRA
specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
@@ -103,9 +95,6 @@ specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
specialize qw/aom_d153_predictor_8x8 ssse3/;
specialize qw/aom_v_predictor_8x8 neon msa sse2/;
if (aom_config("CONFIG_ALT_INTRA") eq "") {
specialize qw/aom_tm_predictor_8x8 neon dspr2 msa sse2/;
} # CONFIG_ALT_INTRA
specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
@@ -113,9 +102,6 @@ specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
specialize qw/aom_d153_predictor_16x16 ssse3/;
specialize qw/aom_v_predictor_16x16 neon msa sse2/;
if (aom_config("CONFIG_ALT_INTRA") eq "") {
specialize qw/aom_tm_predictor_16x16 neon msa sse2/;
} # CONFIG_ALT_INTRA
specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
@@ -123,9 +109,6 @@ specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
specialize qw/aom_h_predictor_32x32 neon msa sse2/;
specialize qw/aom_d153_predictor_32x32 ssse3/;
specialize qw/aom_v_predictor_32x32 neon msa sse2/;
if (aom_config("CONFIG_ALT_INTRA") eq "") {
specialize qw/aom_tm_predictor_32x32 neon msa sse2/;
} # CONFIG_ALT_INTRA
specialize qw/aom_dc_predictor_32x32 msa neon sse2/;
specialize qw/aom_dc_top_predictor_32x32 msa neon sse2/;
specialize qw/aom_dc_left_predictor_32x32 msa neon sse2/;
@@ -133,24 +116,12 @@ specialize qw/aom_dc_128_predictor_32x32 msa neon sse2/;
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_highbd_v_predictor_4x4 sse2/;
if (aom_config("CONFIG_ALT_INTRA") eq "") {
specialize qw/aom_highbd_tm_predictor_4x4 sse2/;
} # CONFIG_ALT_INTRA
specialize qw/aom_highbd_dc_predictor_4x4 sse2/;
specialize qw/aom_highbd_v_predictor_8x8 sse2/;
if (aom_config("CONFIG_ALT_INTRA") eq "") {
specialize qw/aom_highbd_tm_predictor_8x8 sse2/;
} # CONFIG_ALT_INTRA
specialize qw/aom_highbd_dc_predictor_8x8 sse2/;
specialize qw/aom_highbd_v_predictor_16x16 sse2/;
if (aom_config("CONFIG_ALT_INTRA") eq "") {
specialize qw/aom_highbd_tm_predictor_16x16 sse2/;
} # CONFIG_ALT_INTRA
specialize qw/aom_highbd_dc_predictor_16x16 sse2/;
specialize qw/aom_highbd_v_predictor_32x32 sse2/;
if (aom_config("CONFIG_ALT_INTRA") eq "") {
specialize qw/aom_highbd_tm_predictor_32x32 sse2/;
} # CONFIG_ALT_INTRA
specialize qw/aom_highbd_dc_predictor_32x32 sse2/;
} # CONFIG_HIGHBITDEPTH
@@ -529,229 +529,4 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
}
}
}
void aom_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int i;
uint16x8_t q1u16, q3u16;
int16x8_t q1s16;
uint8x8_t d0u8 = vdup_n_u8(0);
uint32x2_t d2u32 = vdup_n_u32(0);
d0u8 = vld1_dup_u8(above - 1);
d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
for (i = 0; i < 4; i++, dst += stride) {
q1u16 = vdupq_n_u16((uint16_t)left[i]);
q1s16 =
vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
d0u8 = vqmovun_s16(q1s16);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}
}
void aom_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int j;
uint16x8_t q0u16, q3u16, q10u16;
int16x8_t q0s16;
uint16x4_t d20u16;
uint8x8_t d0u8, d2u8, d30u8;
d0u8 = vld1_dup_u8(above - 1);
d30u8 = vld1_u8(left);
d2u8 = vld1_u8(above);
q10u16 = vmovl_u8(d30u8);
q3u16 = vsubl_u8(d2u8, d0u8);
d20u16 = vget_low_u16(q10u16);
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
q0u16 = vdupq_lane_u16(d20u16, 0);
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 1);
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 2);
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 3);
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
}
}
void aom_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int j, k;
uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
uint8x16_t q0u8, q1u8;
int16x8_t q0s16, q1s16, q8s16, q11s16;
uint16x4_t d20u16;
uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
q0u8 = vld1q_dup_u8(above - 1);
q1u8 = vld1q_u8(above);
q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
for (k = 0; k < 2; k++, left += 8) {
d18u8 = vld1_u8(left);
q10u16 = vmovl_u8(d18u8);
d20u16 = vget_low_u16(q10u16);
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
q0u16 = vdupq_lane_u16(d20u16, 0);
q8u16 = vdupq_lane_u16(d20u16, 1);
q1s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
q11s16 =
vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
q8s16 =
vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
d2u8 = vqmovun_s16(q1s16);
d3u8 = vqmovun_s16(q0s16);
d22u8 = vqmovun_s16(q11s16);
d23u8 = vqmovun_s16(q8s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
dst += stride;
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 2);
q8u16 = vdupq_lane_u16(d20u16, 3);
q1s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
q11s16 =
vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
q8s16 =
vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
d2u8 = vqmovun_s16(q1s16);
d3u8 = vqmovun_s16(q0s16);
d22u8 = vqmovun_s16(q11s16);
d23u8 = vqmovun_s16(q8s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
dst += stride;
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
dst += stride;
}
}
}
void aom_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int j, k;
uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
uint8x16_t q0u8, q1u8, q2u8;
int16x8_t q12s16, q13s16, q14s16, q15s16;
uint16x4_t d6u16;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
q0u8 = vld1q_dup_u8(above - 1);
q1u8 = vld1q_u8(above);
q2u8 = vld1q_u8(above + 16);
q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
for (k = 0; k < 4; k++, left += 8) {
d26u8 = vld1_u8(left);
q3u16 = vmovl_u8(d26u8);
d6u16 = vget_low_u16(q3u16);
for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
q0u16 = vdupq_lane_u16(d6u16, 0);
q12s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 1);
q12s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 2);
q12s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 3);
q12s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += stride;
}
}
}
#endif // !HAVE_NEON_ASM
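
All of the removed SIMD kernels (NEON above; DSPR2 and MSA below) vectorize the same recurrence, dst[c] = clip(left[r] + above[c] - above[-1]): the top-row differences are widened to 16 bits once, and each output row then adds a broadcast left[r] and saturating-narrows back to 8 bits. A single-row NEON sketch of that pattern, condensed from aom_tm_predictor_8x8_neon above (tm_row8_neon is a hypothetical helper, not part of the patch):

#include <arm_neon.h>
#include <stdint.h>

static void tm_row8_neon(uint8_t *dst, const uint8_t *above, uint8_t left_r) {
  const uint8x8_t top_left = vld1_dup_u8(above - 1);  /* broadcast above[-1] */
  const uint8x8_t top = vld1_u8(above);               /* above[0..7] */
  /* Widened wrap-around difference reinterpreted as signed is exact here. */
  const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(top, top_left));
  const int16x8_t row = vaddq_s16(diff, vdupq_n_s16(left_r));
  vst1_u8(dst, vqmovun_s16(row));                     /* saturate to [0, 255] */
}
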
@@ -179,7 +179,6 @@ static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
}
}
#if CONFIG_ALT_INTRA
static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
@@ -344,21 +343,6 @@ static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
}
#endif // CONFIG_SMOOTH_HV
#else
static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
const uint8_t *above, const uint8_t *left) {
int r, c;
int ytop_left = above[-1];
for (r = 0; r < bh; r++) {
for (c = 0; c < bw; c++)
dst[c] = clip_pixel(left[r] + above[c] - ytop_left);
dst += stride;
}
}
#endif // CONFIG_ALT_INTRA
static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
int bh, const uint8_t *above,
const uint8_t *left) {
@@ -794,7 +778,6 @@ void aom_highbd_d153_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride,
DST(1, 1) = AVG3(J, I, X);
}
#if CONFIG_ALT_INTRA
static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
int bw, int bh, const uint16_t *above,
const uint16_t *left, int bd) {
@@ -901,23 +884,7 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
dst += stride;
}
}
#endif
#else
static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
int bh, const uint16_t *above,
const uint16_t *left, int bd) {
int r, c;
int ytop_left = above[-1];
(void)bd;
for (r = 0; r < bh; r++) {
for (c = 0; c < bw; c++)
dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd);
dst += stride;
}
}
#endif // CONFIG_ALT_INTRA
#endif // CONFIG_SMOOTH_HV
static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
int bw, int bh,
@@ -1118,16 +1085,12 @@ intra_pred_above_4x4(d135)
intra_pred_above_4x4(d153)
intra_pred_allsizes(v)
intra_pred_allsizes(h)
#if CONFIG_ALT_INTRA
intra_pred_allsizes(smooth)
#if CONFIG_SMOOTH_HV
intra_pred_allsizes(smooth_v)
intra_pred_allsizes(smooth_h)
#endif // CONFIG_SMOOTH_HV
intra_pred_allsizes(paeth)
#else
intra_pred_allsizes(tm)
#endif // CONFIG_ALT_INTRA
intra_pred_allsizes(dc_128)
intra_pred_allsizes(dc_left)
intra_pred_allsizes(dc_top)
@@ -78,148 +78,4 @@ void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
: [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
[stride] "r"(stride));
}
void aom_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int32_t abovel, abover;
int32_t left0, left1, left2, left3;
int32_t res0, res1;
int32_t resl;
int32_t resr;
int32_t top_left;
uint8_t *cm = aom_ff_cropTbl;
__asm__ __volatile__(
"ulw %[resl], (%[above]) \n\t"
"lbu %[left0], (%[left]) \n\t"
"lbu %[left1], 1(%[left]) \n\t"
"lbu %[left2], 2(%[left]) \n\t"
"lbu %[left3], 3(%[left]) \n\t"
"lbu %[top_left], -1(%[above]) \n\t"
"preceu.ph.qbl %[abovel], %[resl] \n\t"
"preceu.ph.qbr %[abover], %[resl] \n\t"
"replv.ph %[left0], %[left0] \n\t"
"replv.ph %[left1], %[left1] \n\t"
"replv.ph %[left2], %[left2] \n\t"
"replv.ph %[left3], %[left3] \n\t"
"replv.ph %[top_left], %[top_left] \n\t"
"addu.ph %[resl], %[abovel], %[left0] \n\t"
"subu.ph %[resl], %[resl], %[top_left] \n\t"
"addu.ph %[resr], %[abover], %[left0] \n\t"
"subu.ph %[resr], %[resr], %[top_left] \n\t"
"sll %[res0], %[resr], 16 \n\t"
"sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t"
"sra %[res1], %[resr], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t"
"sb %[res0], (%[dst]) \n\t"
"sll %[res0], %[resl], 16 \n\t"
"sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t"
"sb %[res1], 1(%[dst]) \n\t"
"sra %[res1], %[resl], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t"
"addu.ph %[resl], %[abovel], %[left1] \n\t"
"subu.ph %[resl], %[resl], %[top_left] \n\t"
"addu.ph %[resr], %[abover], %[left1] \n\t"
"subu.ph %[resr], %[resr], %[top_left] \n\t"
"sb %[res0], 2(%[dst]) \n\t"
"sb %[res1], 3(%[dst]) \n\t"
"add %[dst], %[dst], %[stride] \n\t"
"sll %[res0], %[resr], 16 \n\t"
"sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t"
"sra %[res1], %[resr], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t"
"sb %[res0], (%[dst]) \n\t"
"sll %[res0], %[resl], 16 \n\t"
"sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t"
"sb %[res1], 1(%[dst]) \n\t"
"sra %[res1], %[resl], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t"
"addu.ph %[resl], %[abovel], %[left2] \n\t"
"subu.ph %[resl], %[resl], %[top_left] \n\t"
"addu.ph %[resr], %[abover], %[left2] \n\t"
"subu.ph %[resr], %[resr], %[top_left] \n\t"
"sb %[res0], 2(%[dst]) \n\t"
"sb %[res1], 3(%[dst]) \n\t"
"add %[dst], %[dst], %[stride] \n\t"
"sll %[res0], %[resr], 16 \n\t"
"sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t"
"sra %[res1], %[resr], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t"
"sb %[res0], (%[dst]) \n\t"
"sll %[res0], %[resl], 16 \n\t"
"sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t"
"sb %[res1], 1(%[dst]) \n\t"
"sra %[res1], %[resl], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t"
"addu.ph %[resl], %[abovel], %[left3] \n\t"
"subu.ph %[resl], %[resl], %[top_left] \n\t"
"addu.ph %[resr], %[abover], %[left3] \n\t"
"subu.ph %[resr], %[resr], %[top_left] \n\t"
"sb %[res0], 2(%[dst]) \n\t"
"sb %[res1], 3(%[dst]) \n\t"
"add %[dst], %[dst], %[stride] \n\t"
"sll %[res0], %[resr], 16 \n\t"
"sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t"
"sra %[res1], %[resr], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t"
"sb %[res0], (%[dst]) \n\t"
"sll %[res0], %[resl], 16 \n\t"
"sra %[res0], %[res0], 16 \n\t"
"lbux %[res0], %[res0](%[cm]) \n\t"
"sb %[res1], 1(%[dst]) \n\t"
"sra %[res1], %[resl], 16 \n\t"
"lbux %[res1], %[res1](%[cm]) \n\t"
"sb %[res0], 2(%[dst]) \n\t"
"sb %[res1], 3(%[dst]) \n\t"
: [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0),
[left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0),
[res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl),
[resr] "=&r"(resr), [top_left] "=&r"(top_left)
: [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
[stride] "r"(stride), [cm] "r"(cm));
}
#endif // #if HAVE_DSPR2
@@ -382,176 +382,6 @@ static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
}
}
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
const uint8_t *src_left, uint8_t *dst,
int32_t dst_stride) {
uint32_t val;
uint8_t top_left = src_top_ptr[-1];
v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
v16u8 src0, src1, src2, src3;
v8u16 src_top_left, vec0, vec1, vec2, vec3;
src_top_left = (v8u16)__msa_fill_h(top_left);
val = LW(src_top_ptr);
src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);
src_left0 = __msa_fill_b(src_left[0]);
src_left1 = __msa_fill_b(src_left[1]);
src_left2 = __msa_fill_b(src_left[2]);
src_left3 = __msa_fill_b(src_left[3]);
ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
src_left3, src_top, src0, src1, src2, src3);
HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
}
static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
const uint8_t *src_left, uint8_t *dst,
int32_t dst_stride) {
uint64_t val;
uint8_t top_left = src_top_ptr[-1];
uint32_t loop_cnt;
v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
v8u16 src_top_left, vec0, vec1, vec2, vec3;
v16u8 src0, src1, src2, src3;
val = LD(src_top_ptr);
src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
src_top_left = (v8u16)__msa_fill_h(top_left);
for (loop_cnt = 2; loop_cnt--;) {
src_left0 = __msa_fill_b(src_left[0]);
src_left1 = __msa_fill_b(src_left[1]);
src_left2 = __msa_fill_b(src_left[2]);
src_left3 = __msa_fill_b(src_left[3]);
src_left += 4;
ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
src_left3, src_top, src0, src1, src2, src3);
HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
const uint8_t *src_left, uint8_t *dst,
int32_t dst_stride) {
uint8_t top_left = src_top_ptr[-1];
uint32_t loop_cnt;
v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
v8u16 src_top_left, res_r, res_l;
src_top = LD_SB(src_top_ptr);
src_top_left = (v8u16)__msa_fill_h(top_left);
for (loop_cnt = 4; loop_cnt--;) {
src_left0 = __msa_fill_b(src_left[0]);
src_left1 = __msa_fill_b(src_left[1]);
src_left2 = __msa_fill_b(src_left[2]);
src_left3 = __msa_fill_b(src_left[3]);
src_left += 4;
ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
HADD_UB2_UH(res_r, res_l, res_r, res_l);
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
SAT_UH2_UH(res_r, res_l, 7);
PCKEV_ST_SB(res_r, res_l, dst);
dst += dst_stride;
ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
HADD_UB2_UH(res_r, res_l, res_r, res_l);
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
SAT_UH2_UH(res_r, res_l, 7);
PCKEV_ST_SB(res_r, res_l, dst);
dst += dst_stride;
ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
HADD_UB2_UH(res_r, res_l, res_r, res_l);
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
SAT_UH2_UH(res_r, res_l, 7);
PCKEV_ST_SB(res_r, res_l, dst);
dst += dst_stride;
ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
HADD_UB2_UH(res_r, res_l, res_r, res_l);
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
SAT_UH2_UH(res_r, res_l, 7);
PCKEV_ST_SB(res_r, res_l, dst);
dst += dst_stride;
}
}