Commit 3556deac authored by Johann

combine loopfilter data access

The data processed by the loopfilter overlaps. At the block level, this
results in some redundant transforms. Grouping the filtering allows for
a single 16x16 transpose (and inversion) instead of three 16x8 transposes
(and three more inversions).

This implementation is x86_64 only. We retain the previous
implementation for x86.

Improvements are obviously material-dependent, but it seems to be ~1% in
tests here.
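
A rough sketch of the overlap (plain C, illustrative only, not part of this
change): each inner edge at offset 4, 8 and 12 reads the eight pixels p3..q3
around it, so the three 8-wide slices together span the full 16 lines, which
is why a single 16x16 transpose can feed all three filters.

    /* illustrative sketch only; not libvpx code */
    #include <stdio.h>

    int main(void) {
        int lo = 16, hi = -1;
        for (int edge = 4; edge <= 12; edge += 4) {
            int first = edge - 4;               /* p3 */
            int last = edge + 3;                /* q3 */
            printf("edge %2d reads lines %2d..%2d\n", edge, first, last);
            if (first < lo) lo = first;
            if (last > hi) hi = last;
        }
        /* union is lines 0..15: one 16x16 transpose covers all three edges */
        printf("union: lines %d..%d\n", lo, hi);
        return 0;
    }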

Change-Id: I467b7ec3655be98fb5f1a94b5d145e5e5a660007
parent 6f9457ec
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%macro LF_ABS 2
; %1 value not preserved
; %2 value preserved
; output in %1
movdqa scratch1, %2 ; v2
psubusb scratch1, %1 ; v2 - v1
psubusb %1, %2 ; v1 - v2
por %1, scratch1 ; abs(v2 - v1)
%endmacro
%macro LF_FILTER_HEV_MASK 8-9
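; %1-%8: p3, p2, p1, p0, q0, q1, q2, q3
; %9 (optional): abs(p1 - p0) already computed by the previous set's mask
;     call (left in its %7 as abs(q2 - q3)); skips recomputing it here
; output: %1 = filter mask, %5 = ~hev; %7 ends up holding abs(q2 - q3)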
LF_ABS %1, %2 ; abs(p3 - p2)
LF_ABS %2, %3 ; abs(p2 - p1)
pmaxub %1, %2 ; accumulate mask
%if %0 == 8
movdqa scratch2, %3 ; save p1
LF_ABS scratch2, %4 ; abs(p1 - p0)
%endif
LF_ABS %4, %5 ; abs(p0 - q0)
LF_ABS %5, %6 ; abs(q0 - q1)
%if %0 == 8
pmaxub %5, scratch2 ; accumulate hev
%else
pmaxub %5, %9
%endif
pmaxub %1, %5 ; accumulate mask
LF_ABS %3, %6 ; abs(p1 - q1)
LF_ABS %6, %7 ; abs(q1 - q2)
pmaxub %1, %6 ; accumulate mask
LF_ABS %7, %8 ; abs(q2 - q3)
pmaxub %1, %7 ; accumulate mask
paddusb %4, %4 ; 2 * abs(p0 - q0)
pand %3, [GLOBAL(tfe)]
psrlw %3, 1 ; abs(p1 - q1) / 2
paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
psubusb %1, [limit]
psubusb %4, [blimit]
por %1, %4
pcmpeqb %1, zero ; mask
psubusb %5, [thresh]
pcmpeqb %5, zero ; ~hev
%endmacro
%macro LF_FILTER 6
; %1-%4: p1-q1
; %5: mask
; %6: ~hev (inverted high edge variance flag, from LF_FILTER_HEV_MASK)
movdqa scratch2, %6 ; save ~hev
pxor %1, [GLOBAL(t80)] ; ps1
pxor %4, [GLOBAL(t80)] ; qs1
movdqa scratch1, %1
psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1)
pandn scratch2, scratch1 ; vp8_filter &= hev
pxor %2, [GLOBAL(t80)] ; ps0
pxor %3, [GLOBAL(t80)] ; qs0
movdqa scratch1, %3
psubsb scratch1, %2 ; qs0 - ps0
paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
pand %5, scratch2 ; &= mask
movdqa scratch2, %5
paddsb %5, [GLOBAL(t4)] ; Filter1
paddsb scratch2, [GLOBAL(t3)] ; Filter2
; Filter1 >> 3
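; no per-byte arithmetic shift exists, so shift words and fix up: t1f masks
; off bits pulled in from the neighboring byte, te0 restores the sign bits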
movdqa scratch1, zero
pcmpgtb scratch1, %5
psrlw %5, 3
pand scratch1, [GLOBAL(te0)]
pand %5, [GLOBAL(t1f)]
por %5, scratch1
psubsb %3, %5 ; qs0 - Filter1
pxor %3, [GLOBAL(t80)]
; Filter2 >> 3
movdqa scratch1, zero
pcmpgtb scratch1, scratch2
psrlw scratch2, 3
pand scratch1, [GLOBAL(te0)]
pand scratch2, [GLOBAL(t1f)]
por scratch2, scratch1
paddsb %2, scratch2 ; ps0 + Filter2
pxor %2, [GLOBAL(t80)]
; outer tap adjustments
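; vp8_filter = (Filter1 + 1) >> 1, using the same byte-wise shift emulation,
; applied to p1/q1 only where hev is not set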
paddsb %5, [GLOBAL(t1)]
movdqa scratch1, zero
pcmpgtb scratch1, %5
psrlw %5, 1
pand scratch1, [GLOBAL(t80)]
pand %5, [GLOBAL(t7f)]
por %5, scratch1
pand %5, %6 ; vp8_filter &= ~hev
psubsb %4, %5 ; qs1 - vp8_filter
pxor %4, [GLOBAL(t80)]
paddsb %1, %5 ; ps1 + vp8_filter
pxor %1, [GLOBAL(t80)]
%endmacro
;void vp8_loop_filter_bh_y_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *blimit,
; const char *limit,
; const char *thresh
;)
global sym(vp8_loop_filter_bh_y_sse2)
sym(vp8_loop_filter_bh_y_sse2):
%ifidn __OUTPUT_FORMAT__,x64
%define src rcx ; src_ptr
%define stride rdx ; src_pixel_step
%define blimit r8
%define limit r9
%define thresh r10
%define spp rax
%define stride3 r11
%define stride5 r12
%define stride7 r13
push rbp
mov rbp, rsp
SAVE_XMM 11
push r12
push r13
mov thresh, arg(4)
%else
%define src rdi ; src_ptr
%define stride rsi ; src_pixel_step
%define blimit rdx
%define limit rcx
%define thresh r8
%define spp rax
%define stride3 r9
%define stride5 r10
%define stride7 r11
%endif
%define scratch1 xmm5
%define scratch2 xmm6
%define zero xmm7
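; i0-i15 address the 16 rows of the 16x16 region via src/spp and the
; stride multiples computed below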
%define i0 [src]
%define i1 [spp]
%define i2 [src + 2 * stride]
%define i3 [spp + 2 * stride]
%define i4 [src + 4 * stride]
%define i5 [spp + 4 * stride]
%define i6 [src + 2 * stride3]
%define i7 [spp + 2 * stride3]
%define i8 [src + 8 * stride]
%define i9 [spp + 8 * stride]
%define i10 [src + 2 * stride5]
%define i11 [spp + 2 * stride5]
%define i12 [src + 4 * stride3]
%define i13 [spp + 4 * stride3]
%define i14 [src + 2 * stride7]
%define i15 [spp + 2 * stride7]
; prep work
lea spp, [src + stride]
lea stride3, [stride + 2 * stride]
lea stride5, [stride3 + 2 * stride]
lea stride7, [stride3 + 4 * stride]
pxor zero, zero
; load the first set into registers
movdqa xmm0, i0
movdqa xmm1, i1
movdqa xmm2, i2
movdqa xmm3, i3
movdqa xmm4, i4
movdqa xmm8, i5
movdqa xmm9, i6 ; q2, will contain abs(p1-p0)
movdqa xmm10, i7
LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
movdqa xmm1, i2
movdqa xmm2, i3
movdqa xmm3, i4
movdqa xmm8, i5
LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
movdqa i2, xmm1
movdqa i3, xmm2
; second set
movdqa i4, xmm3
movdqa i5, xmm8
movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm2, i8
movdqa xmm4, i9
movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
movdqa xmm11, i11
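; xmm9 already holds abs(p1 - p0) for this set; the previous mask call
; left it there as abs(q2 - q3)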
LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm4, i8
movdqa xmm8, i9
LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
movdqa i6, xmm0
movdqa i7, xmm1
; last set
movdqa i8, xmm4
movdqa i9, xmm8
movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm2, i12
movdqa xmm3, i13
movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
movdqa xmm11, i15
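; xmm10 carries abs(p1 - p0) for this set from the previous mask call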
LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm3, i12
movdqa xmm8, i13
LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
movdqa i10, xmm0
movdqa i11, xmm1
movdqa i12, xmm3
movdqa i13, xmm8
%ifidn __OUTPUT_FORMAT__,x64
pop r13
pop r12
RESTORE_XMM
pop rbp
%endif
ret
;void vp8_loop_filter_bv_y_sse2
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *blimit,
; const char *limit,
; const char *thresh
;)
global sym(vp8_loop_filter_bv_y_sse2)
sym(vp8_loop_filter_bv_y_sse2):
%ifidn __OUTPUT_FORMAT__,x64
%define src rcx ; src_ptr
%define stride rdx ; src_pixel_step
%define blimit r8
%define limit r9
%define thresh r10
%define spp rax
%define stride3 r11
%define stride5 r12
%define stride7 r13
push rbp
mov rbp, rsp
SAVE_XMM 15
push r12
push r13
mov thresh, arg(4)
%else
%define src rdi
%define stride rsi
%define blimit rdx
%define limit rcx
%define thresh r8
%define spp rax
%define stride3 r9
%define stride5 r10
%define stride7 r11
%endif
%define scratch1 xmm5
%define scratch2 xmm6
%define zero xmm7
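; s0-s15 address the 16 source rows; they are transposed below so that
; pixel columns can be filtered as rows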
%define s0 [src]
%define s1 [spp]
%define s2 [src + 2 * stride]
%define s3 [spp + 2 * stride]
%define s4 [src + 4 * stride]
%define s5 [spp + 4 * stride]
%define s6 [src + 2 * stride3]
%define s7 [spp + 2 * stride3]
%define s8 [src + 8 * stride]
%define s9 [spp + 8 * stride]
%define s10 [src + 2 * stride5]
%define s11 [spp + 2 * stride5]
%define s12 [src + 4 * stride3]
%define s13 [spp + 4 * stride3]
%define s14 [src + 2 * stride7]
%define s15 [spp + 2 * stride7]
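; i0-i15 are stack slots holding the transposed 16x16 block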
%define i0 [rsp]
%define i1 [rsp + 16]
%define i2 [rsp + 32]
%define i3 [rsp + 48]
%define i4 [rsp + 64]
%define i5 [rsp + 80]
%define i6 [rsp + 96]
%define i7 [rsp + 112]
%define i8 [rsp + 128]
%define i9 [rsp + 144]
%define i10 [rsp + 160]
%define i11 [rsp + 176]
%define i12 [rsp + 192]
%define i13 [rsp + 208]
%define i14 [rsp + 224]
%define i15 [rsp + 240]
ALIGN_STACK 16, rax
; reserve stack space
%define temp_storage 0 ; size is 256 (16*16)
%define stack_size 256
sub rsp, stack_size
; prep work
lea spp, [src + stride]
lea stride3, [stride + 2 * stride]
lea stride5, [stride3 + 2 * stride]
lea stride7, [stride3 + 4 * stride]
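; transpose the 16x16 block: interleave bytes, then words, then dwords,
; and finish with the qword unpacks in the final combination below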
; 8-f
movdqa xmm0, s8
movdqa xmm1, xmm0
punpcklbw xmm0, s9 ; 80 90
punpckhbw xmm1, s9 ; 88 98
movdqa xmm2, s10
movdqa xmm3, xmm2
punpcklbw xmm2, s11 ; a0 b0
punpckhbw xmm3, s11 ; a8 b8
movdqa xmm4, xmm0
punpcklwd xmm0, xmm2 ; 80 90 a0 b0
punpckhwd xmm4, xmm2 ; 84 94 a4 b4
movdqa xmm2, xmm1
punpcklwd xmm1, xmm3 ; 88 98 a8 b8
punpckhwd xmm2, xmm3 ; 8c 9c ac bc
; using xmm[0124]
; work on next 4 rows
movdqa xmm3, s12
movdqa xmm5, xmm3
punpcklbw xmm3, s13 ; c0 d0
punpckhbw xmm5, s13 ; c8 d8
movdqa xmm6, s14
movdqa xmm7, xmm6
punpcklbw xmm6, s15 ; e0 f0
punpckhbw xmm7, s15 ; e8 f8
movdqa xmm8, xmm3
punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
movdqa xmm6, xmm5
punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
punpckhwd xmm6, xmm7 ; cc dc ec fc
; pull the third and fourth sets together
movdqa xmm7, xmm0
punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
movdqa xmm3, xmm4
punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
movdqa xmm8, xmm1
punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
movdqa xmm5, xmm2
punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
; save the calculations. we only have 15 registers ...
movdqa i0, xmm0
movdqa i1, xmm7
movdqa i2, xmm4
movdqa i3, xmm3
movdqa i4, xmm1
movdqa i5, xmm8
movdqa i6, xmm2
movdqa i7, xmm5
; 0-7
movdqa xmm0, s0
movdqa xmm1, xmm0
punpcklbw xmm0, s1 ; 00 10
punpckhbw xmm1, s1 ; 08 18
movdqa xmm2, s2
movdqa xmm3, xmm2
punpcklbw xmm2, s3 ; 20 30
punpckhbw xmm3, s3 ; 28 38
movdqa xmm4, xmm0
punpcklwd xmm0, xmm2 ; 00 10 20 30
punpckhwd xmm4, xmm2 ; 04 14 24 34
movdqa xmm2, xmm1
punpcklwd xmm1, xmm3 ; 08 18 28 38
punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
; using xmm[0124]
; work on next 4 rows
movdqa xmm3, s4
movdqa xmm5, xmm3
punpcklbw xmm3, s5 ; 40 50
punpckhbw xmm5, s5 ; 48 58
movdqa xmm6, s6
movdqa xmm7, xmm6
punpcklbw xmm6, s7 ; 60 70
punpckhbw xmm7, s7 ; 68 78
movdqa xmm8, xmm3
punpcklwd xmm3, xmm6 ; 40 50 60 70
punpckhwd xmm8, xmm6 ; 44 54 64 74
movdqa xmm6, xmm5
punpcklwd xmm5, xmm7 ; 48 58 68 78
punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
; pull the first two sets together
movdqa xmm7, xmm0
punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
movdqa xmm3, xmm4
punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
movdqa xmm8, xmm1
punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
movdqa xmm5, xmm2
punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
; final combination
movdqa xmm6, xmm0
punpcklqdq xmm0, i0
punpckhqdq xmm6, i0
movdqa xmm9, xmm7
punpcklqdq xmm7, i1
punpckhqdq xmm9, i1
movdqa xmm10, xmm4
punpcklqdq xmm4, i2
punpckhqdq xmm10, i2
movdqa xmm11, xmm3
punpcklqdq xmm3, i3
punpckhqdq xmm11, i3
movdqa xmm12, xmm1
punpcklqdq xmm1, i4
punpckhqdq xmm12, i4
movdqa xmm13, xmm8
punpcklqdq xmm8, i5
punpckhqdq xmm13, i5
movdqa xmm14, xmm2
punpcklqdq xmm2, i6
punpckhqdq xmm14, i6
movdqa xmm15, xmm5
punpcklqdq xmm5, i7
punpckhqdq xmm15, i7
movdqa i0, xmm0
movdqa i1, xmm6
movdqa i2, xmm7
movdqa i3, xmm9
movdqa i4, xmm4
movdqa i5, xmm10
movdqa i6, xmm3
movdqa i7, xmm11
movdqa i8, xmm1
movdqa i9, xmm12
movdqa i10, xmm8
movdqa i11, xmm13
movdqa i12, xmm2
movdqa i13, xmm14
movdqa i14, xmm5
movdqa i15, xmm15
; TRANSPOSED DATA AVAILABLE ON THE STACK
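; rows i1/i2 are still live in xmm6/xmm7, which the filter macros use as
; scratch2/zero, so copy them aside before clearing zero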
movdqa xmm12, xmm6
movdqa xmm13, xmm7
pxor zero, zero
LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
movdqa xmm1, i2
movdqa xmm2, i3
movdqa xmm8, i4
movdqa xmm9, i5
LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
movdqa i2, xmm1
movdqa i3, xmm2
; second set
movdqa i4, xmm8
movdqa i5, xmm9
movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm2, i8
movdqa xmm4, i9
movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
movdqa xmm11, i11
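; xmm3 already holds abs(p1 - p0) for this set (abs(q2 - q3) from the
; first mask call)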
LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
movdqa xmm0, i6
movdqa xmm1, i7
movdqa xmm3, i8
movdqa xmm4, i9
LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
movdqa i6, xmm0
movdqa i7, xmm1
; last set
movdqa i8, xmm3
movdqa i9, xmm4
movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm2, i12
movdqa xmm8, i13
movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
movdqa xmm11, i15
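; xmm10 carries abs(p1 - p0) for this set from the previous mask call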
LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
movdqa xmm0, i10
movdqa xmm1, i11
movdqa xmm4, i12
movdqa xmm8, i13
LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
movdqa i10, xmm0
movdqa i11, xmm1
movdqa i12, xmm4
movdqa i13, xmm8
; RESHUFFLE AND WRITE OUT
; 8-f
movdqa xmm0, i8
movdqa xmm1, xmm0
punpcklbw xmm0, i9 ; 80 90
punpckhbw xmm1, i9 ; 88 98
movdqa xmm2, i10
movdqa xmm3, xmm2
punpcklbw xmm2, i11 ; a0 b0
punpckhbw xmm3, i11 ; a8 b8
movdqa xmm4, xmm0
punpcklwd xmm0, xmm2 ; 80 90 a0 b0
punpckhwd xmm4, xmm2 ; 84 94 a4 b4
movdqa xmm2, xmm1
punpcklwd xmm1, xmm3 ; 88 98 a8 b8