Commit b1ba4cc3 authored by Johann's avatar Johann
Browse files

Rearrange loopfilter functions

Separate functions and rename files. This will make it easier to disable
some functions later to help work around a compiler issue in chromium.

Change-Id: I7f30e109f77c4cd22e2eda7bd006672f090c1dc5
parent fca0037e
......@@ -124,7 +124,6 @@ static INLINE void vp9_loop_filter_neon_16(
return;
}
#if !HAVE_NEON_ASM
void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
const uint8_t *blimit0,
const uint8_t *limit0,
......@@ -178,47 +177,3 @@ void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
vst1q_u8(s, q8u8);
return;
}
#endif // !HAVE_NEON_ASM
// Runs vp9_lpf_horizontal_8_neon twice with a count of 1: first at s using
// blimit0/limit0/thresh0, then 8 bytes further along the same row using
// blimit1/limit1/thresh1.
void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                    const uint8_t *blimit0,
                                    const uint8_t *limit0,
                                    const uint8_t *thresh0,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
  uint8_t *const second_half = s + 8;

  vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
  vp9_lpf_horizontal_8_neon(second_half, p, blimit1, limit1, thresh1, 1);
}
// Runs vp9_lpf_vertical_4_neon twice with a count of 1: first at s using
// blimit0/limit0/thresh0, then 8 rows (8 * p bytes) further down using
// blimit1/limit1/thresh1.
void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
                                  const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8_t *const lower_half = s + 8 * p;

  vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
  vp9_lpf_vertical_4_neon(lower_half, p, blimit1, limit1, thresh1, 1);
}
// Runs vp9_lpf_vertical_8_neon twice with a count of 1: first at s using
// blimit0/limit0/thresh0, then 8 rows (8 * p bytes) further down using
// blimit1/limit1/thresh1.
void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
                                  const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8_t *const lower_half = s + 8 * p;

  vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
  vp9_lpf_vertical_8_neon(lower_half, p, blimit1, limit1, thresh1, 1);
}
#if HAVE_NEON_ASM
// Runs vp9_lpf_vertical_16_neon twice with the same blimit/limit/thresh:
// first at s, then 8 rows (8 * p bytes) further down.
void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh) {
  uint8_t *const lower_half = s + 8 * p;

  vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
  vp9_lpf_vertical_16_neon(lower_half, p, blimit, limit, thresh);
}
#endif // HAVE_NEON_ASM
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
// Core 4-tap loop filter applied to 8 lanes at once.
//
// Inputs are the eight pixels straddling the edge (p3..p0 on one side,
// q0..q3 on the other) plus the blimit/limit/thresh vectors. All vector
// inputs are passed by value, so the caller may pass the address of one of
// its input variables as an output. The four filtered pixels nearest the edge
// are written through d4ru8..d7ru8 (op1, op0, oq0, oq1). Lanes that fail the
// filter mask are written back unchanged because the filter value is zeroed
// for them.
static INLINE void vp9_loop_filter_neon(
uint8x8_t dblimit, // flimit
uint8x8_t dlimit, // limit
uint8x8_t dthresh, // thresh
uint8x8_t d3u8, // p3
uint8x8_t d4u8, // p2
uint8x8_t d5u8, // p1
uint8x8_t d6u8, // p0
uint8x8_t d7u8, // q0
uint8x8_t d16u8, // q1
uint8x8_t d17u8, // q2
uint8x8_t d18u8, // q3
uint8x8_t *d4ru8, // p1
uint8x8_t *d5ru8, // p0
uint8x8_t *d6ru8, // q0
uint8x8_t *d7ru8) { // q1
uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
int16x8_t q12s16;
int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
// filter_mask: absolute differences between neighbouring pixels.
d19u8 = vabd_u8(d3u8, d4u8); // abs(p3 - p2)
d20u8 = vabd_u8(d4u8, d5u8); // abs(p2 - p1)
d21u8 = vabd_u8(d5u8, d6u8); // abs(p1 - p0)
d22u8 = vabd_u8(d16u8, d7u8); // abs(q1 - q0)
d3u8 = vabd_u8(d17u8, d16u8); // abs(q2 - q1)
d4u8 = vabd_u8(d18u8, d17u8); // abs(q3 - q2)
// Only the largest difference has to be compared against limit.
d19u8 = vmax_u8(d19u8, d20u8);
d20u8 = vmax_u8(d21u8, d22u8);
d3u8 = vmax_u8(d3u8, d4u8);
d23u8 = vmax_u8(d19u8, d20u8);
d17u8 = vabd_u8(d6u8, d7u8); // abs(p0 - q0)
// hev mask inputs: all-ones lanes where the difference exceeds thresh.
d21u8 = vcgt_u8(d21u8, dthresh); // abs(p1 - p0) > thresh
d22u8 = vcgt_u8(d22u8, dthresh); // abs(q1 - q0) > thresh
d23u8 = vmax_u8(d23u8, d3u8); // max of all six differences
d28u8 = vabd_u8(d5u8, d16u8); // abs(p1 - q1)
d17u8 = vqadd_u8(d17u8, d17u8); // abs(p0 - q0) * 2, saturating
d23u8 = vcge_u8(dlimit, d23u8); // all-ones where limit >= max difference
// XOR with 0x80 converts the unsigned pixels to signed (ps1, ps0, qs0, qs1).
d18u8 = vdup_n_u8(0x80);
d5u8 = veor_u8(d5u8, d18u8); // ps1
d6u8 = veor_u8(d6u8, d18u8); // ps0
d7u8 = veor_u8(d7u8, d18u8); // qs0
d16u8 = veor_u8(d16u8, d18u8); // qs1
d28u8 = vshr_n_u8(d28u8, 1); // abs(p1 - q1) / 2
d17u8 = vqadd_u8(d17u8, d28u8); // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
d19u8 = vdup_n_u8(3);
d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
vreinterpret_s8_u8(d6u8)); // qs0 - ps0 (non-saturating)
d17u8 = vcge_u8(dblimit, d17u8); // all-ones where blimit >= the sum above
d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
vreinterpret_s8_u8(d16u8)); // filter = clamp(ps1 - qs1)
d22u8 = vorr_u8(d21u8, d22u8); // hev mask
// Widen to 16 bits so 3 * (qs0 - ps0) + filter cannot wrap before clamping.
q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); // 3 * (qs0 - ps0)
d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); // filter &= hev
d23u8 = vand_u8(d23u8, d17u8); // filter_mask = limit mask & blimit mask
q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
d17u8 = vdup_n_u8(4);
d27s8 = vqmovn_s16(q12s16); // filter = clamp(filter + 3 * (qs0 - ps0))
d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); // filter &= mask
d27s8 = vreinterpret_s8_u8(d27u8);
d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); // filter2 = clamp(filter + 3)
d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); // filter1 = clamp(filter + 4)
d28s8 = vshr_n_s8(d28s8, 3); // filter2 >>= 3
d27s8 = vshr_n_s8(d27s8, 3); // filter1 >>= 3
d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); // clamp(ps0 + filter2)
d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); // clamp(qs0 - filter1)
// Outer tap adjustment: rounding halve of filter1, zeroed where hev is set.
d27s8 = vrshr_n_s8(d27s8, 1);
d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); // filter &= ~hev
d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); // clamp(ps1 + filter)
d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); // clamp(qs1 - filter)
// XOR with 0x80 again to convert the results back to unsigned pixels.
*d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); // op1
*d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); // op0
*d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); // oq0
*d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); // oq1
return;
}
// Filters `count` consecutive 8-pixel-wide segments of a horizontal edge.
//
// src points at the first row below the edge (q0). For each segment the eight
// rows from src - 4 * pitch to src + 3 * pitch are loaded as p3..q3, passed to
// vp9_loop_filter_neon, and the four filtered rows (op1, op0, oq0, oq1) are
// written back to rows src - 2 * pitch .. src + pitch.
//
// blimit, limit and thresh are each read as a full 8-byte vector.
void vp9_lpf_horizontal_4_neon(
        unsigned char *src,
        int pitch,
        unsigned char *blimit,
        unsigned char *limit,
        unsigned char *thresh,
        int count) {
    int seg, r;
    uint8_t *top, *col;
    uint8x8_t vblimit, vlimit, vthresh;
    uint8x8_t row[8];                 // row[0] = p3 ... row[7] = q3
    uint8x8_t op1, op0, oq0, oq1;

    if (count == 0) {
        return;
    }

    vblimit = vld1_u8(blimit);
    vlimit = vld1_u8(limit);
    vthresh = vld1_u8(thresh);

    // Row holding p3 for the first segment.
    top = src - (pitch << 2);

    for (seg = 0; seg < count; seg++) {
        col = top + seg * 8;

        for (r = 0; r < 8; r++) {
            row[r] = vld1_u8(col + r * pitch);
        }

        vp9_loop_filter_neon(vblimit, vlimit, vthresh,
                             row[0], row[1], row[2], row[3],
                             row[4], row[5], row[6], row[7],
                             &op1, &op0, &oq0, &oq1);

        // Only the two rows on either side of the edge are modified.
        vst1_u8(col + 2 * pitch, op1);
        vst1_u8(col + 3 * pitch, op0);
        vst1_u8(col + 4 * pitch, oq0);
        vst1_u8(col + 5 * pitch, oq1);
    }
}
// Filters `count` vertically stacked 8-row blocks of a vertical edge.
//
// src points at the first pixel to the right of the edge (q0) in the first
// row. For each block, 8 bytes starting 4 columns left of the edge are loaded
// from each of 8 rows, transposed so that every vector holds one column
// (p3..q3), passed to vp9_loop_filter_neon, and the four filtered columns
// (op1, op0, oq0, oq1) are written back to columns src - 2 .. src + 1 of each
// row.
//
// blimit, limit and thresh are each read as a full 8-byte vector.
//
// Each block is addressed from its own base pointer (src + i * 8 * pitch).
// Earlier code advanced src while storing and then added 8 * pitch on top, so
// every block after the first was read from and written to the wrong place.
// Behaviour for count == 1 is unchanged.
void vp9_lpf_vertical_4_neon(
        unsigned char *src,
        int pitch,
        unsigned char *blimit,
        unsigned char *limit,
        unsigned char *thresh,
        int count) {
    int i, pitch8;
    uint8_t *s, *block;
    uint8x8_t dblimit, dlimit, dthresh;
    uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
    uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
    uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
    uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
    uint8x8x4_t d4Result;

    if (count == 0) {  // nothing to filter
        return;
    }

    dblimit = vld1_u8(blimit);
    dlimit = vld1_u8(limit);
    dthresh = vld1_u8(thresh);

    pitch8 = pitch * 8;
    for (i = 0, block = src; i < count; i++, block += pitch8) {
        // Load 8 rows of 8 pixels, starting 4 columns left of the edge.
        s = block - 4;
        d3u8 = vld1_u8(s);
        s += pitch;
        d4u8 = vld1_u8(s);
        s += pitch;
        d5u8 = vld1_u8(s);
        s += pitch;
        d6u8 = vld1_u8(s);
        s += pitch;
        d7u8 = vld1_u8(s);
        s += pitch;
        d16u8 = vld1_u8(s);
        s += pitch;
        d17u8 = vld1_u8(s);
        s += pitch;
        d18u8 = vld1_u8(s);

        // Transpose the 8x8 block so each vector holds one column.
        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
                          vreinterpret_u32_u8(d7u8));
        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
                          vreinterpret_u32_u8(d16u8));
        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
                          vreinterpret_u32_u8(d17u8));
        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
                          vreinterpret_u32_u8(d18u8));

        d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
                          vreinterpret_u16_u32(d2tmp2.val[0]));
        d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
                          vreinterpret_u16_u32(d2tmp3.val[0]));
        d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
                          vreinterpret_u16_u32(d2tmp2.val[1]));
        d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
                          vreinterpret_u16_u32(d2tmp3.val[1]));

        d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
                         vreinterpret_u8_u16(d2tmp5.val[0]));
        d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
                         vreinterpret_u8_u16(d2tmp5.val[1]));
        d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
                          vreinterpret_u8_u16(d2tmp7.val[0]));
        d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
                          vreinterpret_u8_u16(d2tmp7.val[1]));

        d3u8 = d2tmp8.val[0];    // p3
        d4u8 = d2tmp8.val[1];    // p2
        d5u8 = d2tmp9.val[0];    // p1
        d6u8 = d2tmp9.val[1];    // p0
        d7u8 = d2tmp10.val[0];   // q0
        d16u8 = d2tmp10.val[1];  // q1
        d17u8 = d2tmp11.val[0];  // q2
        d18u8 = d2tmp11.val[1];  // q3

        // Inputs are passed by value, so reusing d4u8..d7u8 as outputs is safe.
        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
                             &d4u8, &d5u8, &d6u8, &d7u8);

        d4Result.val[0] = d4u8;  // op1
        d4Result.val[1] = d5u8;  // op0
        d4Result.val[2] = d6u8;  // oq0
        d4Result.val[3] = d7u8;  // oq1

        // Each vst4_lane_u8 writes op1, op0, oq0, oq1 of one row, which
        // transposes the result back. The lane index must be a constant, so
        // the stores stay unrolled.
        s = block - 2;
        vst4_lane_u8(s, d4Result, 0);
        s += pitch;
        vst4_lane_u8(s, d4Result, 1);
        s += pitch;
        vst4_lane_u8(s, d4Result, 2);
        s += pitch;
        vst4_lane_u8(s, d4Result, 3);
        s += pitch;
        vst4_lane_u8(s, d4Result, 4);
        s += pitch;
        vst4_lane_u8(s, d4Result, 5);
        s += pitch;
        vst4_lane_u8(s, d4Result, 6);
        s += pitch;
        vst4_lane_u8(s, d4Result, 7);
    }
    return;
}
;
; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_lpf_horizontal_4_neon|
EXPORT |vp9_lpf_vertical_4_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
; works on 16 iterations at a time.
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
; void vp9_lpf_horizontal_4_neon(uint8_t *s,
; int p /* pitch */,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh,
; int count)
;
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
; sp+4 int count
|vp9_lpf_horizontal_4_neon| PROC
; lr is pushed first, so the stack arguments move up by 4 bytes:
; thresh is at [sp, #4] and count at [sp, #8].
push {lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
ldr r12, [sp, #8] ; load count
ldr r2, [sp, #4] ; load thresh
add r1, r1, r1 ; r1 = 2 * pitch; r2 and r3 each walk every other row
cmp r12, #0
beq end_vp9_lf_h_edge
vld1.8 {d1[]}, [r3] ; duplicate *limit
vld1.8 {d2[]}, [r2] ; duplicate *thresh
count_lf_h_loop
sub r2, r0, r1, lsl #1 ; r2 = s - 4 * pitch (row of p3)
add r3, r2, r1, lsr #1 ; r3 = r2 + pitch (row of p2)
vld1.u8 {d3}, [r2@64], r1 ; p3
vld1.u8 {d4}, [r3@64], r1 ; p2
vld1.u8 {d5}, [r2@64], r1 ; p1
vld1.u8 {d6}, [r3@64], r1 ; p0
vld1.u8 {d7}, [r2@64], r1 ; q0
vld1.u8 {d16}, [r3@64], r1 ; q1
vld1.u8 {d17}, [r2@64] ; q2
vld1.u8 {d18}, [r3@64] ; q3
sub r2, r2, r1, lsl #1 ; r2 back 4 rows: from the q2 row to the p1 row
sub r3, r3, r1, lsl #1 ; r3 back 4 rows: from the q3 row to the p0 row
bl vp9_loop_filter_neon
vst1.u8 {d4}, [r2@64], r1 ; store op1
vst1.u8 {d5}, [r3@64], r1 ; store op0
vst1.u8 {d6}, [r2@64], r1 ; store oq0
vst1.u8 {d7}, [r3@64], r1 ; store oq1
add r0, r0, #8 ; next 8-pixel segment along the edge
subs r12, r12, #1
bne count_lf_h_loop
end_vp9_lf_h_edge
pop {pc}
ENDP ; |vp9_lpf_horizontal_4_neon|
; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
; works on 16 iterations at a time.
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
; void vp9_lpf_vertical_4_neon(uint8_t *s,
; int p /* pitch */,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh,
; int count)
;
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
; sp+4 int count
|vp9_lpf_vertical_4_neon| PROC
    ; lr is pushed first, so the stack arguments move up by 4 bytes:
    ; thresh is at [sp, #4] and count at [sp, #8].
    push        {lr}

    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
    ldr         r12, [sp, #8]             ; load count
    vld1.8      {d1[]}, [r3]              ; duplicate *limit

    ldr         r3, [sp, #4]              ; load thresh
    sub         r2, r0, #4                ; r2 = s - 4: leftmost loaded column
    cmp         r12, #0
    beq         end_vp9_lf_v_edge

    vld1.8      {d2[]}, [r3]              ; duplicate *thresh

count_lf_v_loop
    vld1.u8     {d3}, [r2], r1            ; load 8 rows of 8 pixels
    vld1.u8     {d4}, [r2], r1
    vld1.u8     {d5}, [r2], r1
    vld1.u8     {d6}, [r2], r1
    vld1.u8     {d7}, [r2], r1
    vld1.u8     {d16}, [r2], r1
    vld1.u8     {d17}, [r2], r1
    vld1.u8     {d18}, [r2]

    ; transpose the 8x8 block so that each d register holds one column
    ; (d3 = p3 ... d18 = q3)
    vtrn.32     d3, d7
    vtrn.32     d4, d16
    vtrn.32     d5, d17
    vtrn.32     d6, d18

    vtrn.16     d3, d5
    vtrn.16     d4, d6
    vtrn.16     d7, d17
    vtrn.16     d16, d18

    vtrn.8      d3, d4
    vtrn.8      d5, d6
    vtrn.8      d7, d16
    vtrn.8      d17, d18

    bl          vp9_loop_filter_neon

    sub         r0, r0, #2                ; r0 = s - 2: column of op1

    ; store op1, op0, oq0, oq1; each vst4 writes one row, which transposes
    ; the result back
    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]

    ; After seven post-increments r0 = (s - 2) + 7 * pitch. Step one more row
    ; and undo the column offset so that r0 = s + 8 * pitch for the next
    ; block. (Adding 8 * pitch here overshot by 7 rows and left r0 two
    ; columns to the left whenever count > 1.)
    add         r0, r0, r1                ; first row of the next block
    add         r0, r0, #2                ; back to the edge column
    subs        r12, r12, #1
    subne       r2, r0, #4                ; r2 = s - 4 for the next block
    bne         count_lf_v_loop

end_vp9_lf_v_edge
    pop         {pc}
    ENDP        ; |vp9_lpf_vertical_4_neon|
; void vp9_loop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store. The function does not use
; registers d8-d15.
;
; Inputs:
; r0-r3, r12 PRESERVE
; d0 blimit
; d1 limit
; d2 thresh
; d3 p3
; d4 p2
; d5 p1
; d6 p0
; d7 q0
; d16 q1
; d17 q2
; d18 q3
;
; Outputs:
; d4 op1
; d5 op0
; d6 oq0
; d7 oq1
|vp9_loop_filter_neon| PROC
; filter_mask
vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
; only compare the largest value to limit
vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
vabd.u8 d17, d6, d7 ; abs(p0 - q0)
vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
vmov.u8 d18, #0x80
vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
; hevmask
vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
veor d7, d7, d18 ; qs0
vcge.u8 d23, d1, d23 ; abs(m1) > limit
; filter() function
; convert to signed
vshr.u8 d28, d28, #1 ; a = a / 2
veor d6, d6, d18 ; ps0
veor d5, d5, d18 ; ps1
vqadd.u8 d17, d17, d28 ; a = b + a
veor d16, d16, d18 ; qs1
vmov.u8 d19, #3
vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
vcge.u8 d17, d0, d17 ; a > blimit
vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
vorr d22, d21, d22 ; hevmask
vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
vand d27, d27, d22 ; filter &= hev
vand d23, d23, d17 ; filter_mask
vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
vmov.u8 d17, #4
; filter = clamp(filter + 3 * ( qs0 - ps0))
vqmovn.s16 d27, q12
vand d27, d27, d23 ; filter &= mask
vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
vshr.s8 d28, d28, #3 ; filter2 >>= 3
vshr.s8 d27, d27, #3 ; filter1 >>= 3
vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
; outer tap adjustments