Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
248cf6f6
Commit
248cf6f6
authored
Nov 13, 2013
by
Parag Salasakar
Browse files
mips dsp-ase r2 vp9 decoder loopfilter module optimizations (rebase)
Change-Id: Ia7f640ca395e8deaac5986f19d11ab18d85eec2d
parent
3f3d14e1
Changes
9
Expand all
Hide whitespace changes
Inline
Side-by-side
vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
0 → 100644
View file @
248cf6f6
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
#if HAVE_DSPR2
/* Loop-filters a horizontal (top) block edge.
 *
 * s      - pointer to the first pixel of the row just BELOW the edge
 * pitch  - stride between rows, in bytes
 * blimit - blimit filter parameter (single byte, replicated to 4 lanes)
 * limit  - limit filter parameter (single byte, replicated to 4 lanes)
 * thresh - high-edge-variance threshold (single byte, replicated to 4 lanes)
 * count  - NOTE(review): unused; the loop below always runs twice,
 *          filtering 8 pixel columns (2 quads of 4) per call.
 *
 * Works on 4 pixels at a time: each `lw` pulls 4 horizontally adjacent
 * pixels into one 32-bit register, and the DSP-ASE quad-byte instructions
 * operate on all 4 lanes at once.
 */
void vp9_loop_filter_horizontal_edge_dspr2(unsigned char *s,
                                           int pitch,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
                                           const uint8_t *thresh,
                                           int count) {
  uint8_t   i;
  uint32_t  mask;                               /* per-lane filter mask (0x00 or 0xFF per byte) */
  uint32_t  hev;                                /* per-lane high-edge-variance flags */
  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;    /* 4-pixel quads of the 8 rows around the edge */
  uint8_t   *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;  /* row pointers: s-4*pitch .. s+3*pitch */
  uint32_t  thresh_vec, flimit_vec, limit_vec;  /* parameters replicated into all 4 byte lanes */
  uint32_t  uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit  = *limit;
  uthresh = *thresh;

  /* create quad-byte: replv.qb replicates the low byte of the source
     register into all four byte lanes of the destination */
  __asm__ __volatile__ (
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit),
        [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  vp9_prefetch_store(s);

  /* loop filter designed to work using chars so that we can make maximum use
     of 8 bit simd instructions. */
  for (i = 0; i < 2; i++) {
    /* row pointers: sm1 is 4 rows above the edge, s6 is 3 rows below */
    sm1 = s - (pitch << 2);
    s0  = sm1 + pitch;
    s1  = s0 + pitch;
    s2  = s - pitch;
    s3  = s;
    s4  = s + pitch;
    s5  = s4 + pitch;
    s6  = s5 + pitch;

    /* load the 4 rows nearest the edge (the ones the filter may modify) */
    __asm__ __volatile__ (
        "lw     %[p1],  (%[s1])    \n\t"
        "lw     %[p2],  (%[s2])    \n\t"
        "lw     %[p3],  (%[s3])    \n\t"
        "lw     %[p4],  (%[s4])    \n\t"

        : [p1] "=&r" (p1), [p2] "=&r" (p2),
          [p3] "=&r" (p3), [p4] "=&r" (p4)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
    );

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
       mask will be zero and filtering is not needed */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      /* load the remaining 4 outer rows, needed only for mask/hev */
      __asm__ __volatile__ (
          "lw       %[pm1], (%[sm1])   \n\t"
          "lw       %[p0],  (%[s0])    \n\t"
          "lw       %[p5],  (%[s5])    \n\t"
          "lw       %[p6],  (%[s6])    \n\t"

          : [pm1] "=&r" (pm1), [p0] "=&r" (p0),
            [p5] "=&r" (p5), [p6] "=&r" (p6)
          : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
      );

      /* compute per-lane hev and mask over the 8-row neighborhood */
      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
                                p0, p3, p4, p5, p6, thresh_vec,
                                &hev, &mask);

      /* if mask == 0 do filtering is not needed */
      if (mask) {
        /* filtering */
        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* store the 4 filtered rows back (word stores, rows are aligned) */
        __asm__ __volatile__ (
            "sw     %[p1],  (%[s1])    \n\t"
            "sw     %[p2],  (%[s2])    \n\t"
            "sw     %[p3],  (%[s3])    \n\t"
            "sw     %[p4],  (%[s4])    \n\t"

            :
            : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
              [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
        );
      }
    }

    /* advance to the next 4-pixel quad */
    s = s + 4;
  }
}
/* Loop-filters a vertical (left) block edge.
 *
 * s      - pointer to the first pixel of the column just RIGHT of the edge
 * pitch  - stride between rows, in bytes
 * blimit/limit/thresh - single-byte filter parameters, replicated into
 *          all four byte lanes for the quad-byte SIMD instructions
 * count  - NOTE(review): unused; the loop always runs twice, filtering
 *          8 pixel rows (2 groups of 4) per call.
 *
 * Strategy: load a 4x8 neighborhood as aligned words (4 bytes left of the
 * edge and 4 bytes right of it, for 4 consecutive rows), transpose each
 * 4x4 half in registers so columns become quads, run the same quad-lane
 * filter as the horizontal case, then scatter the results back with byte
 * stores because the transposed output is not word-aligned in memory.
 */
void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s,
                                         int pitch,
                                         const uint8_t *blimit,
                                         const uint8_t *limit,
                                         const uint8_t *thresh,
                                         int count) {
  uint8_t   i;
  uint32_t  mask, hev;                          /* per-lane filter mask and high-edge-variance flags */
  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;    /* loaded words, then transposed column quads */
  uint8_t   *s1, *s2, *s3, *s4;                 /* the 4 rows processed this iteration */
  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;  /* transpose scratch registers */
  uint32_t  thresh_vec, flimit_vec, limit_vec;  /* parameters replicated into all 4 byte lanes */
  uint32_t  uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit  = *limit;
  uthresh = *thresh;

  /* create quad-byte: replicate each parameter byte into all 4 lanes */
  __asm__ __volatile__ (
      "replv.qb     %[thresh_vec],  %[uthresh]    \n\t"
      "replv.qb     %[flimit_vec],  %[uflimit]    \n\t"
      "replv.qb     %[limit_vec],   %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit),
        [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  vp9_prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    /* 4 consecutive rows; s advances 4 rows per iteration */
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s  = s4 + pitch;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p2  = *((uint32_t *)(s1 - 4));
    p6  = *((uint32_t *)(s1));
    p1  = *((uint32_t *)(s2 - 4));
    p5  = *((uint32_t *)(s2));
    p0  = *((uint32_t *)(s3 - 4));
    p4  = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3  = *((uint32_t *)(s4));

    /* transpose pm1, p0, p1, p2 (the 4x4 half left of the edge):
       precrq/precr pair bytes from two words, then precrq.ph.w/append
       recombine halfwords so each register holds one column as a quad */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* transpose p3, p4, p5, p6 (the 4x4 half right of the edge) */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      /* compute per-lane hev and mask over the transposed columns */
      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
                                p0, p3, p4, p5, p6, thresh_vec,
                                &hev, &mask);

      /* if mask == 0 do filtering is not needed */
      if (mask) {
        /* filtering */
        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* unpack processed 4x4 neighborhood
         * don't use transpose on output data
         * because memory isn't aligned
         *
         * byte lane 0 of each quad belongs to row s4; after each group of
         * stores the quads are shifted right 8 bits to expose the byte for
         * the next row (s3, then s2, then s1)
         */
        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s4])    \n\t"
            "sb     %[p3],   0(%[s4])    \n\t"
            "sb     %[p2],  -1(%[s4])    \n\t"
            "sb     %[p1],  -2(%[s4])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s4] "r" (s4)
        );

        /* shift the next row's byte into lane 0 */
        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s3])    \n\t"
            "sb     %[p3],   0(%[s3])    \n\t"
            "sb     %[p2],  -1(%[s3])    \n\t"
            "sb     %[p1],  -2(%[s3])    \n\t"

            : [p1] "+r" (p1)
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s2])    \n\t"
            "sb     %[p3],   0(%[s2])    \n\t"
            "sb     %[p2],  -1(%[s2])    \n\t"
            "sb     %[p1],  -2(%[s2])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s2] "r" (s2)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb     %[p4],   1(%[s1])    \n\t"
            "sb     %[p3],   0(%[s1])    \n\t"
            "sb     %[p2],  -1(%[s1])    \n\t"
            "sb     %[p1],  -2(%[s1])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s1] "r" (s1)
        );
      }
    }
  }
}
#endif // #if HAVE_DSPR2
vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
0 → 100644
View file @
248cf6f6
This diff is collapsed.
Click to expand it.
vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
0 → 100644
View file @
248cf6f6
This diff is collapsed.
Click to expand it.
vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
0 → 100644
View file @
248cf6f6
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
#include <stdlib.h>
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_onyxc_int.h"
#if HAVE_DSPR2
/* processing 4 pixels at the same time
 * compute hev and mask in the same function
 *
 * All pixel arguments (p1..p0, q0..q3) are quads: one byte lane per pixel.
 * limit/flimit/thresh carry the filter parameter replicated into all four
 * byte lanes.  On return, each byte lane of *mask is 0xFF where the filter
 * should run and 0x00 where it should not; *hev marks high-edge-variance
 * lanes the same way.
 *
 * The comparison results are accumulated bytewise in r (mask) and r3 (hev),
 * moved into the DSP condition-code bits with wrdsp, and materialized as
 * 0x00/0xFF byte lanes with pick.qb.
 */
static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
                                             uint32_t p1, uint32_t p0,
                                             uint32_t p3, uint32_t p2,
                                             uint32_t q0, uint32_t q1,
                                             uint32_t q2, uint32_t q3,
                                             uint32_t thresh, uint32_t *hev,
                                             uint32_t *mask) {
  uint32_t  c, r, r3, r_k;        /* c: per-compare result; r: mask bits; r3: hev bits; r_k: |a-b| */
  uint32_t  s1, s2, s3;           /* scratch for the flimit test */
  uint32_t  ones = 0xFFFFFFFF;    /* all-ones source for pick.qb */
  uint32_t  hev1;

  __asm__ __volatile__ (
      /* mask |= (abs(p3 - p2) > limit)
         abs() is built from two saturating subtracts OR-ed together */
      "subu_s.qb      %[c],    %[p3],     %[p2]     \n\t"
      "subu_s.qb      %[r_k],  %[p2],     %[p3]     \n\t"
      "or             %[r_k],  %[r_k],    %[c]      \n\t"
      "cmpgu.lt.qb    %[c],    %[limit],  %[r_k]    \n\t"
      "or             %[r],    $0,        %[c]      \n\t"

      /* mask |= (abs(p2 - p1) > limit) */
      "subu_s.qb      %[c],    %[p2],     %[p1]     \n\t"
      "subu_s.qb      %[r_k],  %[p1],     %[p2]     \n\t"
      "or             %[r_k],  %[r_k],    %[c]      \n\t"
      "cmpgu.lt.qb    %[c],    %[limit],  %[r_k]    \n\t"
      "or             %[r],    %[r],      %[c]      \n\t"

      /* mask |= (abs(p1 - p0) > limit)
       * hev  |= (abs(p1 - p0) > thresh)
       */
      "subu_s.qb      %[c],    %[p1],     %[p0]     \n\t"
      "subu_s.qb      %[r_k],  %[p0],     %[p1]     \n\t"
      "or             %[r_k],  %[r_k],    %[c]      \n\t"
      "cmpgu.lt.qb    %[c],    %[thresh], %[r_k]    \n\t"
      "or             %[r3],   $0,        %[c]      \n\t"
      "cmpgu.lt.qb    %[c],    %[limit],  %[r_k]    \n\t"
      "or             %[r],    %[r],      %[c]      \n\t"

      /* mask |= (abs(q1 - q0) > limit)
       * hev  |= (abs(q1 - q0) > thresh)
       */
      "subu_s.qb      %[c],    %[q1],     %[q0]     \n\t"
      "subu_s.qb      %[r_k],  %[q0],     %[q1]     \n\t"
      "or             %[r_k],  %[r_k],    %[c]      \n\t"
      "cmpgu.lt.qb    %[c],    %[thresh], %[r_k]    \n\t"
      "or             %[r3],   %[r3],     %[c]      \n\t"
      "cmpgu.lt.qb    %[c],    %[limit],  %[r_k]    \n\t"
      "or             %[r],    %[r],      %[c]      \n\t"

      /* mask |= (abs(q2 - q1) > limit) */
      "subu_s.qb      %[c],    %[q2],     %[q1]     \n\t"
      "subu_s.qb      %[r_k],  %[q1],     %[q2]     \n\t"
      "or             %[r_k],  %[r_k],    %[c]      \n\t"
      "cmpgu.lt.qb    %[c],    %[limit],  %[r_k]    \n\t"
      "or             %[r],    %[r],      %[c]      \n\t"
      /* position hev bits for the wrdsp in the second asm block */
      "sll            %[r3],   %[r3],     24        \n\t"

      /* mask |= (abs(q3 - q2) > limit) */
      "subu_s.qb      %[c],    %[q3],     %[q2]     \n\t"
      "subu_s.qb      %[r_k],  %[q2],     %[q3]     \n\t"
      "or             %[r_k],  %[r_k],    %[c]      \n\t"
      "cmpgu.lt.qb    %[c],    %[limit],  %[r_k]    \n\t"
      "or             %[r],    %[r],      %[c]      \n\t"

      : [c] "=&r" (c), [r_k] "=&r" (r_k),
        [r] "=&r" (r), [r3] "=&r" (r3)
      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
  );

  __asm__ __volatile__ (
      /* abs(p0 - q0) */
      "subu_s.qb      %[c],    %[r_k],    %[c]      \n\t" /* NOTE(review): original line order kept below */
      : [c] "=&r" (c), [r_k] "=&r" (r_k)
      :
  );

  __asm__ __volatile__ (
      /* abs(p0 - q0) */
      "subu_s.qb      %[c],    %[p0],     %[q0]     \n\t"
      "subu_s.qb      %[r_k],  %[q0],     %[p0]     \n\t"
      /* load hev condition bits into the DSP control register */
      "wrdsp          %[r3]                         \n\t"
      "or             %[s1],   %[r_k],    %[c]      \n\t"

      /* abs(p1 - q1) */
      "subu_s.qb      %[c],    %[p1],     %[q1]     \n\t"
      "addu_s.qb      %[s3],   %[s1],     %[s1]     \n\t"
      /* hev lanes -> 0xFF / 0x00 from the condition bits */
      "pick.qb        %[hev1], %[ones],   $0        \n\t"
      "subu_s.qb      %[r_k],  %[q1],     %[p1]     \n\t"
      "or             %[s2],   %[r_k],    %[c]      \n\t"

      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
      "shrl.qb        %[s2],   %[s2],     1         \n\t"
      "addu_s.qb      %[s1],   %[s2],     %[s3]     \n\t"
      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]     \n\t"
      "or             %[r],    %[r],      %[c]      \n\t"
      "sll            %[r],    %[r],      24        \n\t"

      /* load mask condition bits, then invert into 0x00/0xFF lanes */
      "wrdsp          %[r]                          \n\t"
      "pick.qb        %[s2],   $0,        %[ones]   \n\t"

      : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1),
        [hev1] "=&r" (hev1), [s2] "=&r" (s2), [r] "+r" (r),
        [s3] "=&r" (s3)
      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
  );

  *hev  = hev1;
  *mask = s2;
}
/* Like vp9_filter_hev_mask_dspr2, but additionally computes the flat4 mask.
 *
 * Pixel arguments are 4-lane quads (one byte per pixel); limit/flimit/thresh
 * are the filter parameters replicated into all four byte lanes.  Outputs:
 *   *hev  - 0xFF per lane where edge variance exceeds thresh
 *   *mask - 0xFF per lane where the normal filter should run
 *   *flat - 0xFF per lane where the neighborhood is "flat", i.e. every
 *           tested pixel difference is <= 1 (flat_thresh = 0x01010101)
 *
 * Comparison results accumulate bytewise in r (mask), r3 (hev) and r_flat
 * (flat); wrdsp moves them into the DSP condition-code bits and pick.qb
 * materializes 0x00/0xFF lanes.
 */
static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit,
                                                       uint32_t flimit,
                                                       uint32_t thresh,
                                                       uint32_t p1,
                                                       uint32_t p0,
                                                       uint32_t p3,
                                                       uint32_t p2,
                                                       uint32_t q0,
                                                       uint32_t q1,
                                                       uint32_t q2,
                                                       uint32_t q3,
                                                       uint32_t *hev,
                                                       uint32_t *mask,
                                                       uint32_t *flat) {
  uint32_t  c, r, r3, r_k, r_flat;      /* c: per-compare result; r/r3/r_flat: mask/hev/flat bits; r_k: |a-b| */
  uint32_t  s1, s2, s3;                 /* scratch for the flimit test */
  uint32_t  ones = 0xFFFFFFFF;          /* all-ones source for pick.qb */
  uint32_t  flat_thresh = 0x01010101;   /* per-lane flatness threshold of 1 */
  uint32_t  hev1;
  uint32_t  flat1;

  __asm__ __volatile__ (
      /* mask |= (abs(p3 - p2) > limit)
         abs() is built from two saturating subtracts OR-ed together */
      "subu_s.qb      %[c],      %[p3],          %[p2]     \n\t"
      "subu_s.qb      %[r_k],    %[p2],          %[p3]     \n\t"
      "or             %[r_k],    %[r_k],         %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[limit],       %[r_k]    \n\t"
      "or             %[r],      $0,             %[c]      \n\t"

      /* mask |= (abs(p2 - p1) > limit) */
      "subu_s.qb      %[c],      %[p2],          %[p1]     \n\t"
      "subu_s.qb      %[r_k],    %[p1],          %[p2]     \n\t"
      "or             %[r_k],    %[r_k],         %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[limit],       %[r_k]    \n\t"
      "or             %[r],      %[r],           %[c]      \n\t"

      /* mask |= (abs(p1 - p0) > limit)
       * hev  |= (abs(p1 - p0) > thresh)
       * flat |= (abs(p1 - p0) > flat_thresh)
       */
      "subu_s.qb      %[c],      %[p1],          %[p0]     \n\t"
      "subu_s.qb      %[r_k],    %[p0],          %[p1]     \n\t"
      "or             %[r_k],    %[r_k],         %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[thresh],      %[r_k]    \n\t"
      "or             %[r3],     $0,             %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[limit],       %[r_k]    \n\t"
      "or             %[r],      %[r],           %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[flat_thresh], %[r_k]    \n\t"
      "or             %[r_flat], $0,             %[c]      \n\t"

      /* mask |= (abs(q1 - q0) > limit)
       * hev  |= (abs(q1 - q0) > thresh)
       * flat |= (abs(q1 - q0) > flat_thresh)
       */
      "subu_s.qb      %[c],      %[q1],          %[q0]     \n\t"
      "subu_s.qb      %[r_k],    %[q0],          %[q1]     \n\t"
      "or             %[r_k],    %[r_k],         %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[thresh],      %[r_k]    \n\t"
      "or             %[r3],     %[r3],          %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[limit],       %[r_k]    \n\t"
      "or             %[r],      %[r],           %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[flat_thresh], %[r_k]    \n\t"
      "or             %[r_flat], %[r_flat],      %[c]      \n\t"

      /* flat |= (abs(p0 - p2) > flat_thresh) */
      "subu_s.qb      %[c],      %[p0],          %[p2]     \n\t"
      "subu_s.qb      %[r_k],    %[p2],          %[p0]     \n\t"
      "or             %[r_k],    %[r_k],         %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[flat_thresh], %[r_k]    \n\t"
      "or             %[r_flat], %[r_flat],      %[c]      \n\t"

      /* flat |= (abs(q0 - q2) > flat_thresh) */
      "subu_s.qb      %[c],      %[q0],          %[q2]     \n\t"
      "subu_s.qb      %[r_k],    %[q2],          %[q0]     \n\t"
      "or             %[r_k],    %[r_k],         %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[flat_thresh], %[r_k]    \n\t"
      "or             %[r_flat], %[r_flat],      %[c]      \n\t"

      /* flat |= (abs(p3 - p0) > flat_thresh) */
      "subu_s.qb      %[c],      %[p3],          %[p0]     \n\t"
      "subu_s.qb      %[r_k],    %[p0],          %[p3]     \n\t"
      "or             %[r_k],    %[r_k],         %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[flat_thresh], %[r_k]    \n\t"
      "or             %[r_flat], %[r_flat],      %[c]      \n\t"

      /* flat |= (abs(q3 - q0) > flat_thresh) */
      "subu_s.qb      %[c],      %[q3],          %[q0]     \n\t"
      "subu_s.qb      %[r_k],    %[q0],          %[q3]     \n\t"
      "or             %[r_k],    %[r_k],         %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[flat_thresh], %[r_k]    \n\t"
      "or             %[r_flat], %[r_flat],      %[c]      \n\t"

      "sll            %[r_flat], %[r_flat],      24        \n\t"
      /* look at stall here */
      "wrdsp          %[r_flat]                            \n\t"
      /* flat lanes -> 0x00 where any flat test fired, 0xFF otherwise */
      "pick.qb        %[flat1],  $0,             %[ones]   \n\t"

      /* mask |= (abs(q2 - q1) > limit) */
      "subu_s.qb      %[c],      %[q2],          %[q1]     \n\t"
      "subu_s.qb      %[r_k],    %[q1],          %[q2]     \n\t"
      "or             %[r_k],    %[r_k],         %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[limit],       %[r_k]    \n\t"
      "or             %[r],      %[r],           %[c]      \n\t"
      /* position hev bits for the wrdsp in the second asm block */
      "sll            %[r3],     %[r3],          24        \n\t"

      /* mask |= (abs(q3 - q2) > limit) */
      "subu_s.qb      %[c],      %[q3],          %[q2]     \n\t"
      "subu_s.qb      %[r_k],    %[q2],          %[q3]     \n\t"
      "or             %[r_k],    %[r_k],         %[c]      \n\t"
      "cmpgu.lt.qb    %[c],      %[limit],       %[r_k]    \n\t"
      "or             %[r],      %[r],           %[c]      \n\t"

      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r),
        [r3] "=&r" (r3), [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1)
      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh),
        [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
  );

  __asm__ __volatile__ (
      /* abs(p0 - q0) */
      "subu_s.qb      %[c],    %[p0],     %[q0]     \n\t"
      "subu_s.qb      %[r_k],  %[q0],     %[p0]     \n\t"
      /* load hev condition bits into the DSP control register */
      "wrdsp          %[r3]                         \n\t"
      "or             %[s1],   %[r_k],    %[c]      \n\t"

      /* abs(p1 - q1) */
      "subu_s.qb      %[c],    %[p1],     %[q1]     \n\t"
      "addu_s.qb      %[s3],   %[s1],     %[s1]     \n\t"
      /* hev lanes -> 0xFF / 0x00 from the condition bits */
      "pick.qb        %[hev1], %[ones],   $0        \n\t"
      "subu_s.qb      %[r_k],  %[q1],     %[p1]     \n\t"
      "or             %[s2],   %[r_k],    %[c]      \n\t"

      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
      "shrl.qb        %[s2],   %[s2],     1         \n\t"
      "addu_s.qb      %[s1],   %[s2],     %[s3]     \n\t"
      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]     \n\t"
      "or             %[r],    %[r],      %[c]      \n\t"
      "sll            %[r],    %[r],      24        \n\t"

      /* load mask condition bits, then invert into 0x00/0xFF lanes */
      "wrdsp          %[r]                          \n\t"
      "pick.qb        %[s2],   $0,        %[ones]   \n\t"

      : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1),
        [hev1] "=&r" (hev1), [s2] "=&r" (s2), [r] "+r" (r),
        [s3] "=&r" (s3)
      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
  );

  *hev  = hev1;
  *mask = s2;
  *flat = flat1;
}