Commit 248cf6f6 authored by Parag Salasakar's avatar Parag Salasakar
Browse files

MIPS DSP-ASE r2 VP9 decoder loopfilter module optimizations (rebase)

Change-Id: Ia7f640ca395e8deaac5986f19d11ab18d85eec2d
parent 3f3d14e1
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
#if HAVE_DSPR2
/* Loop-filters a horizontal edge: 8 pixel columns total, processed as two
 * 4-column groups (four packed 8-bit pixels per 32-bit register) using
 * MIPS DSP-ASE r2 SIMD instructions.
 *
 * s      - points at the first row BELOW the edge; rows above are reached
 *          with negative multiples of pitch
 * pitch  - byte stride between rows
 * blimit - blimit threshold byte (replicated into all 4 lanes)
 * limit  - limit threshold byte (replicated into all 4 lanes)
 * thresh - high-edge-variance threshold byte (replicated into all 4 lanes)
 * count  - unused: the loop below is hard-coded to 2 iterations (8 pixels)
 */
void vp9_loop_filter_horizontal_edge_dspr2(unsigned char *s,
int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh,
int count) {
uint8_t i;
uint32_t mask;
uint32_t hev;
uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
uint32_t thresh_vec, flimit_vec, limit_vec;
uint32_t uflimit, ulimit, uthresh;
uflimit = *blimit;
ulimit = *limit;
uthresh = *thresh;
/* create quad-byte: replicate each single-byte threshold into all four
 * byte lanes of a 32-bit register (replv.qb) */
__asm__ __volatile__ (
"replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t"
: [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
[limit_vec] "=r" (limit_vec)
: [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
);
/* prefetch data for store */
vp9_prefetch_store(s);
/* loop filter designed to work using chars so that we can make maximum use
of 8 bit simd instructions. */
for (i = 0; i < 2; i++) {
/* row pointers: sm1..s6 are rows -4..+3 relative to the edge at s */
sm1 = s - (pitch << 2);
s0 = sm1 + pitch;
s1 = s0 + pitch;
s2 = s - pitch;
s3 = s;
s4 = s + pitch;
s5 = s4 + pitch;
s6 = s5 + pitch;
/* word loads pull 4 pixels per row; assumes s is 4-byte aligned
 * (lw requires word alignment) -- NOTE(review): confirm callers
 * guarantee this for horizontal edges */
__asm__ __volatile__ (
"lw %[p1], (%[s1]) \n\t"
"lw %[p2], (%[s2]) \n\t"
"lw %[p3], (%[s3]) \n\t"
"lw %[p4], (%[s4]) \n\t"
: [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
: [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
);
/* if (p1 - p4 == 0) and (p2 - p3 == 0)
mask will be zero and filtering is not needed */
if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
/* only now load the outer rows needed for the full mask test */
__asm__ __volatile__ (
"lw %[pm1], (%[sm1]) \n\t"
"lw %[p0], (%[s0]) \n\t"
"lw %[p5], (%[s5]) \n\t"
"lw %[p6], (%[s6]) \n\t"
: [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
[p6] "=&r" (p6)
: [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
);
vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
pm1, p0, p3, p4, p5, p6,
thresh_vec, &hev, &mask);
/* if mask == 0, filtering is not needed */
if (mask) {
/* filtering: updates the two rows on each side of the edge in place */
vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
__asm__ __volatile__ (
"sw %[p1], (%[s1]) \n\t"
"sw %[p2], (%[s2]) \n\t"
"sw %[p3], (%[s3]) \n\t"
"sw %[p4], (%[s4]) \n\t"
:
: [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
[s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
);
}
}
/* advance to the next 4-pixel group */
s = s + 4;
}
}
/* Loop-filters a vertical edge: 8 rows total, processed as two groups of
 * 4 rows. Each group is loaded as two 4x4 byte tiles (left and right of
 * the edge), transposed with DSP-ASE pack instructions so that the same
 * packed-pixel filter used for horizontal edges can be applied, then the
 * filtered pixels are written back with byte stores (output columns are
 * not word-aligned).
 *
 * s      - points at the first pixel to the RIGHT of the edge, top row
 * pitch  - byte stride between rows
 * blimit - blimit threshold byte (replicated into all 4 lanes)
 * limit  - limit threshold byte (replicated into all 4 lanes)
 * thresh - high-edge-variance threshold byte (replicated into all 4 lanes)
 * count  - unused: the loop below is hard-coded to 2 iterations (8 rows)
 */
void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s,
int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh,
int count) {
uint8_t i;
uint32_t mask, hev;
uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
uint8_t *s1, *s2, *s3, *s4;
uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
uint32_t thresh_vec, flimit_vec, limit_vec;
uint32_t uflimit, ulimit, uthresh;
uflimit = *blimit;
ulimit = *limit;
uthresh = *thresh;
/* create quad-byte: replicate each single-byte threshold into all four
 * byte lanes of a 32-bit register (replv.qb) */
__asm__ __volatile__ (
"replv.qb %[thresh_vec], %[uthresh] \n\t"
"replv.qb %[flimit_vec], %[uflimit] \n\t"
"replv.qb %[limit_vec], %[ulimit] \n\t"
: [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
[limit_vec] "=r" (limit_vec)
: [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
);
/* prefetch data for store */
vp9_prefetch_store(s + pitch);
for (i = 0; i < 2; i++) {
/* s1..s4 are four consecutive rows; s advances past them for the
 * next iteration */
s1 = s;
s2 = s + pitch;
s3 = s2 + pitch;
s4 = s3 + pitch;
s = s4 + pitch;
/* load quad-byte vectors
 * memory is 4 byte aligned
 * NOTE(review): the uint32_t* casts assume (s - 4) and s are 4-byte
 * aligned for every row -- confirm the caller guarantees this
 */
p2 = *((uint32_t *)(s1 - 4));
p6 = *((uint32_t *)(s1));
p1 = *((uint32_t *)(s2 - 4));
p5 = *((uint32_t *)(s2));
p0 = *((uint32_t *)(s3 - 4));
p4 = *((uint32_t *)(s3));
pm1 = *((uint32_t *)(s4 - 4));
p3 = *((uint32_t *)(s4));
/* transpose pm1, p0, p1, p2 -- 4x4 byte transpose of the tile left of
 * the edge, built from precrq/precr halfword packs plus append */
__asm__ __volatile__ (
"precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
"precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
"precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
"precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
"precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
"precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
"precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
"precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
"precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
"precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
"append %[p1], %[sec3], 16 \n\t"
"append %[pm1], %[sec4], 16 \n\t"
: [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
[prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
[p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
[sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
:
);
/* transpose p3, p4, p5, p6 -- same 4x4 byte transpose for the tile
 * right of the edge */
__asm__ __volatile__ (
"precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
"precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
"precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
"precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
"precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
"precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
"precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
"precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
"precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
"precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
"append %[p5], %[sec3], 16 \n\t"
"append %[p3], %[sec4], 16 \n\t"
: [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
[prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
[p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
[sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
:
);
/* if (p1 - p4 == 0) and (p2 - p3 == 0)
 * mask will be zero and filtering is not needed
 */
if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
p0, p3, p4, p5, p6, thresh_vec,
&hev, &mask);
/* if mask == 0, filtering is not needed */
if (mask) {
/* filtering */
vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
/* unpack processed 4x4 neighborhood
 * don't use transpose on output data
 * because memory isn't aligned
 * Each store block writes the low byte of each register at
 * offsets -2..+1 around one row's edge, then all four registers
 * are shifted right 8 bits to expose the next row's bytes.
 */
__asm__ __volatile__ (
"sb %[p4], 1(%[s4]) \n\t"
"sb %[p3], 0(%[s4]) \n\t"
"sb %[p2], -1(%[s4]) \n\t"
"sb %[p1], -2(%[s4]) \n\t"
:
: [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
[s4] "r" (s4)
);
__asm__ __volatile__ (
"srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t"
: [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
:
);
/* NOTE(review): p1 is listed as an in/out ("+r") operand here even
 * though this asm only reads it; the sibling store blocks list it
 * as a plain input. Looks like a constraint-list slip -- harmless
 * but worth verifying/normalizing upstream. */
__asm__ __volatile__ (
"sb %[p4], 1(%[s3]) \n\t"
"sb %[p3], 0(%[s3]) \n\t"
"sb %[p2], -1(%[s3]) \n\t"
"sb %[p1], -2(%[s3]) \n\t"
: [p1] "+r" (p1)
: [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3)
);
__asm__ __volatile__ (
"srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t"
: [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
:
);
__asm__ __volatile__ (
"sb %[p4], 1(%[s2]) \n\t"
"sb %[p3], 0(%[s2]) \n\t"
"sb %[p2], -1(%[s2]) \n\t"
"sb %[p1], -2(%[s2]) \n\t"
:
: [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
[s2] "r" (s2)
);
__asm__ __volatile__ (
"srl %[p4], %[p4], 8 \n\t"
"srl %[p3], %[p3], 8 \n\t"
"srl %[p2], %[p2], 8 \n\t"
"srl %[p1], %[p1], 8 \n\t"
: [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
:
);
__asm__ __volatile__ (
"sb %[p4], 1(%[s1]) \n\t"
"sb %[p3], 0(%[s1]) \n\t"
"sb %[p2], -1(%[s1]) \n\t"
"sb %[p1], -2(%[s1]) \n\t"
:
: [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
[s1] "r" (s1)
);
}
}
}
}
#endif // #if HAVE_DSPR2
This diff is collapsed.
This diff is collapsed.
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
#include <stdlib.h>
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_onyxc_int.h"
#if HAVE_DSPR2
/* processing 4 pixels at the same time
* compute hev and mask in the same function */
/* processing 4 pixels at the same time
 * compute hev and mask in the same function
 *
 * Each of p3..p0 / q0..q3 holds four packed 8-bit pixels (one byte lane per
 * pixel column); limit/flimit/thresh hold the threshold byte replicated in
 * all four lanes.
 *
 * Per-byte absolute differences are formed as
 *   abs(a - b) = subu_s.qb(a, b) | subu_s.qb(b, a)
 * (saturating subtract in each direction, then OR). cmpgu.lt.qb produces a
 * 4-bit per-lane result in a GPR; shifting it left by 24 and writing it
 * with wrdsp places those bits in the DSPControl CCOND field, which pick.qb
 * then expands into a 0xFF / 0x00 byte per lane.
 *
 * Outputs:
 *   *mask - byte lane is 0xFF when ALL limit tests passed for that pixel
 *           column (filtering should run), 0x00 otherwise
 *   *hev  - byte lane is 0xFF when abs(p1-p0) or abs(q1-q0) exceeded
 *           thresh (high edge variance), 0x00 otherwise
 */
static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
uint32_t p1, uint32_t p0,
uint32_t p3, uint32_t p2,
uint32_t q0, uint32_t q1,
uint32_t q2, uint32_t q3,
uint32_t thresh, uint32_t *hev,
uint32_t *mask) {
uint32_t c, r, r3, r_k;
uint32_t s1, s2, s3;
uint32_t ones = 0xFFFFFFFF;
uint32_t hev1;
/* accumulate the "exceeds limit" conditions in r and the "exceeds thresh"
 * conditions in r3 (both as 4-bit lane masks) */
__asm__ __volatile__ (
/* mask |= (abs(p3 - p2) > limit) */
"subu_s.qb %[c], %[p3], %[p2] \n\t"
"subu_s.qb %[r_k], %[p2], %[p3] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], $0, %[c] \n\t"
/* mask |= (abs(p2 - p1) > limit) */
"subu_s.qb %[c], %[p2], %[p1] \n\t"
"subu_s.qb %[r_k], %[p1], %[p2] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
/* mask |= (abs(p1 - p0) > limit)
 * hev |= (abs(p1 - p0) > thresh)
 */
"subu_s.qb %[c], %[p1], %[p0] \n\t"
"subu_s.qb %[r_k], %[p0], %[p1] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
"or %[r3], $0, %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
/* mask |= (abs(q1 - q0) > limit)
 * hev |= (abs(q1 - q0) > thresh)
 */
"subu_s.qb %[c], %[q1], %[q0] \n\t"
"subu_s.qb %[r_k], %[q0], %[q1] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
"or %[r3], %[r3], %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
/* mask |= (abs(q2 - q1) > limit) */
"subu_s.qb %[c], %[q2], %[q1] \n\t"
"subu_s.qb %[r_k], %[q1], %[q2] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
"sll %[r3], %[r3], 24 \n\t"
/* mask |= (abs(q3 - q2) > limit) */
"subu_s.qb %[c], %[q3], %[q2] \n\t"
"subu_s.qb %[r_k], %[q2], %[q3] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
: [c] "=&r" (c), [r_k] "=&r" (r_k),
[r] "=&r" (r), [r3] "=&r" (r3)
: [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
[p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
[q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
);
/* fold in the blimit test, then expand r3 (hev) and r (mask) from CCOND
 * bits into per-byte 0xFF/0x00 masks via wrdsp + pick.qb */
__asm__ __volatile__ (
/* abs(p0 - q0) */
"subu_s.qb %[c], %[p0], %[q0] \n\t"
"subu_s.qb %[r_k], %[q0], %[p0] \n\t"
"wrdsp %[r3] \n\t"
"or %[s1], %[r_k], %[c] \n\t"
/* abs(p1 - q1) */
"subu_s.qb %[c], %[p1], %[q1] \n\t"
"addu_s.qb %[s3], %[s1], %[s1] \n\t"
"pick.qb %[hev1], %[ones], $0 \n\t"
"subu_s.qb %[r_k], %[q1], %[p1] \n\t"
"or %[s2], %[r_k], %[c] \n\t"
/* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
"shrl.qb %[s2], %[s2], 1 \n\t"
"addu_s.qb %[s1], %[s2], %[s3] \n\t"
"cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
"or %[r], %[r], %[c] \n\t"
"sll %[r], %[r], 24 \n\t"
"wrdsp %[r] \n\t"
/* inverted pick: mask byte is 0xFF only where NO condition fired */
"pick.qb %[s2], $0, %[ones] \n\t"
: [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
[s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
: [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
[q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
);
*hev = hev1;
*mask = s2;
}
/* Same as vp9_filter_hev_mask_dspr2 (4 packed pixels at a time, hev + mask)
 * but additionally computes the "flat" flag used to select the wider filter.
 * flat tests compare absolute differences against a fixed per-byte threshold
 * of 1 (flat_thresh = 0x01010101), not against `thresh`.
 *
 * Outputs (per byte lane):
 *   *mask - 0xFF when all limit tests passed (filtering should run)
 *   *hev  - 0xFF when abs(p1-p0) or abs(q1-q0) exceeded thresh
 *   *flat - 0xFF when every flat test passed (region is flat), else 0x00
 */
static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit,
uint32_t flimit,
uint32_t thresh,
uint32_t p1, uint32_t p0,
uint32_t p3, uint32_t p2,
uint32_t q0, uint32_t q1,
uint32_t q2, uint32_t q3,
uint32_t *hev,
uint32_t *mask,
uint32_t *flat) {
uint32_t c, r, r3, r_k, r_flat;
uint32_t s1, s2, s3;
uint32_t ones = 0xFFFFFFFF;
/* per-byte threshold of 1 for the flatness tests */
uint32_t flat_thresh = 0x01010101;
uint32_t hev1;
uint32_t flat1;
__asm__ __volatile__ (
/* mask |= (abs(p3 - p2) > limit) */
"subu_s.qb %[c], %[p3], %[p2] \n\t"
"subu_s.qb %[r_k], %[p2], %[p3] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], $0, %[c] \n\t"
/* mask |= (abs(p2 - p1) > limit) */
"subu_s.qb %[c], %[p2], %[p1] \n\t"
"subu_s.qb %[r_k], %[p1], %[p2] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
/* mask |= (abs(p1 - p0) > limit)
 * hev |= (abs(p1 - p0) > thresh)
 * flat |= (abs(p1 - p0) > flat_thresh)
 */
"subu_s.qb %[c], %[p1], %[p0] \n\t"
"subu_s.qb %[r_k], %[p0], %[p1] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
"or %[r3], $0, %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
"cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
"or %[r_flat], $0, %[c] \n\t"
/* mask |= (abs(q1 - q0) > limit)
 * hev |= (abs(q1 - q0) > thresh)
 * flat |= (abs(q1 - q0) > flat_thresh)
 */
"subu_s.qb %[c], %[q1], %[q0] \n\t"
"subu_s.qb %[r_k], %[q0], %[q1] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
"or %[r3], %[r3], %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
"cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
"or %[r_flat], %[r_flat], %[c] \n\t"
/* flat |= (abs(p0 - p2) > flat_thresh) */
"subu_s.qb %[c], %[p0], %[p2] \n\t"
"subu_s.qb %[r_k], %[p2], %[p0] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
"or %[r_flat], %[r_flat], %[c] \n\t"
/* flat |= (abs(q0 - q2) > flat_thresh) */
"subu_s.qb %[c], %[q0], %[q2] \n\t"
"subu_s.qb %[r_k], %[q2], %[q0] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
"or %[r_flat], %[r_flat], %[c] \n\t"
/* flat |= (abs(p3 - p0) > flat_thresh) */
"subu_s.qb %[c], %[p3], %[p0] \n\t"
"subu_s.qb %[r_k], %[p0], %[p3] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
"or %[r_flat], %[r_flat], %[c] \n\t"
/* flat |= (abs(q3 - q0) > flat_thresh) */
"subu_s.qb %[c], %[q3], %[q0] \n\t"
"subu_s.qb %[r_k], %[q0], %[q3] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
"or %[r_flat], %[r_flat], %[c] \n\t"
"sll %[r_flat], %[r_flat], 24 \n\t"
/* look at stall here */
/* r_flat bits land in DSPControl CCOND via wrdsp; the inverted
 * pick.qb makes flat1 0xFF only where NO flat test fired */
"wrdsp %[r_flat] \n\t"
"pick.qb %[flat1], $0, %[ones] \n\t"
/* mask |= (abs(q2 - q1) > limit) */
"subu_s.qb %[c], %[q2], %[q1] \n\t"
"subu_s.qb %[r_k], %[q1], %[q2] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
"sll %[r3], %[r3], 24 \n\t"
/* mask |= (abs(q3 - q2) > limit) */
"subu_s.qb %[c], %[q3], %[q2] \n\t"
"subu_s.qb %[r_k], %[q2], %[q3] \n\t"
"or %[r_k], %[r_k], %[c] \n\t"
"cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
"or %[r], %[r], %[c] \n\t"
: [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" (r3),
[r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1)
: [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
[p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
[q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh),
[flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
);
/* fold in the blimit test and expand hev (from r3) and mask (from r)
 * into per-byte 0xFF/0x00 masks via wrdsp + pick.qb */
__asm__ __volatile__ (
/* abs(p0 - q0) */
"subu_s.qb %[c], %[p0], %[q0] \n\t"
"subu_s.qb %[r_k], %[q0], %[p0] \n\t"
"wrdsp %[r3] \n\t"
"or %[s1], %[r_k], %[c] \n\t"
/* abs(p1 - q1) */
"subu_s.qb %[c], %[p1], %[q1] \n\t"
"addu_s.qb %[s3], %[s1], %[s1] \n\t"
"pick.qb %[hev1], %[ones], $0 \n\t"
"subu_s.qb %[r_k], %[q1], %[p1] \n\t"
"or %[s2], %[r_k], %[c] \n\t"
/* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
"shrl.qb %[s2], %[s2], 1 \n\t"
"addu_s.qb %[s1], %[s2], %[s3] \n\t"
"cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
"or %[r], %[r], %[c] \n\t"
"sll %[r], %[r], 24 \n\t"
"wrdsp %[r] \n\t"
/* inverted pick: mask byte is 0xFF only where NO condition fired */
"pick.qb %[s2], $0, %[ones] \n\t"
: [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
[s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
: [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
[q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
);
*hev = hev1;
*mask = s2;
*flat = flat1;
}