Commit c3bdffb0 authored by Johann

Move variance functions to vpx_dsp

subpel functions will be moved in another patch.

Change-Id: Idb2e049bad0b9b32ac42cc7731cd6903de2826ce
parent 976f7f42
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_variance8x8_armv6|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_armv6| PROC
push {r4-r10, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #8 ; set loop counter to 8 (=block height)
mov r4, #0 ; initialize sum = 0
mov r5, #0 ; initialize sse = 0
loop
; 1st 4 pixels
ldr r6, [r0, #0x0] ; load 4 src pixels
ldr r7, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r6, r7 ; calculate difference
pld [r0, r1, lsl #1]
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r0, #0x4] ; load 4 src pixels
ldr r7, [r2, #0x4] ; load 4 ref pixels
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
usub8 r8, r6, r7 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
subs r12, r12, #1 ; next row
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
bne loop
; return stuff
ldr r8, [sp, #32] ; get address of sse
mul r1, r4, r4 ; sum * sum
str r5, [r8] ; store sse
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
pop {r4-r10, pc}
ENDP
END
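For reference, the assembly above accumulates the sum of pixel differences in r4 and the sum of squared differences in r5, then returns sse - ((sum * sum) >> 6), i.e. the usual variance with the mean correction divided by the 64 pixels of an 8x8 block. A minimal C sketch of the same computation (reference only, not part of this patch):

/* Reference-only C model of what vp8_variance8x8_armv6 computes above;
 * not part of the patch. */
static unsigned int variance8x8_model(const unsigned char *src, int src_stride,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse)
{
    int i, j, sum = 0;
    unsigned int sq = 0;
    for (i = 0; i < 8; i++)
    {
        for (j = 0; j < 8; j++)
        {
            const int diff = src[j] - ref[j];
            sum += diff;        /* matches the usad8 add/sub accumulation */
            sq += diff * diff;  /* matches the uxtb16 + smlad accumulation */
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = sq;
    return sq - (((unsigned int)sum * sum) >> 6);  /* 64 = 8 * 8 pixels */
}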
@@ -9,10 +9,14 @@
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/variance.h"
#include "vp8/common/filter.h"
// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
#if CONFIG_VP8_ENCODER
#if HAVE_MEDIA
#include "vp8/common/arm/bilinearfilter_arm.h"
@@ -40,8 +44,8 @@ unsigned int vp8_sub_pixel_variance8x8_armv6
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
8, 8, 8, VFilter);
return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
dst_pixels_per_line, sse);
return vpx_variance8x8_media(second_pass, 8, dst_ptr,
dst_pixels_per_line, sse);
}
unsigned int vp8_sub_pixel_variance16x16_armv6
@@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
16, 16, 16, VFilter);
var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
dst_pixels_per_line, sse);
var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
dst_pixels_per_line, sse);
}
return var;
}
#endif /* HAVE_MEDIA */
#endif // HAVE_MEDIA
#if HAVE_NEON
@@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon
return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
}
#endif
#endif // HAVE_NEON
#endif // CONFIG_VP8_ENCODER
@@ -151,14 +151,14 @@ static void multiframe_quality_enhance_block
if (blksize == 16)
{
actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
#ifdef USE_SSD
vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 128)>>8;
vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 32)>>6;
vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 32)>>6;
#else
sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
@@ -168,14 +168,14 @@ static void multiframe_quality_enhance_block
}
else /* if (blksize == 8) */
{
actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
#ifdef USE_SSD
vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 32)>>6;
vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 8)>>4;
vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 8)>>4;
#else
sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
......
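The rounding constants in these hunks are rounded division by the block's pixel count: 16x16 values add 128 and shift right by 8 (divide by 256), 8x8 adds 32 and shifts by 6, and 4x4 adds 8 and shifts by 4. A tiny illustrative helper (not in the patch) that captures the idiom:

/* Illustration only: (x + half) >> shift is round-to-nearest division by
 * 2^shift, the per-pixel normalization used in the hunks above. */
static unsigned int round_shift(unsigned int x, int shift)
{
    return (x + (1u << (shift - 1))) >> shift;
}
/* e.g. round_shift(sse, 8) for a 16x16 block (256 pixels),
 *      round_shift(sse, 6) for 8x8, round_shift(sse, 4) for 4x4. */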
@@ -236,31 +236,6 @@ add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch,
specialize qw/vp8_bilinear_predict4x4 mmx media neon/;
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
#
# Whole-pixel Variance
#
add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance4x4 mmx sse2/;
$vp8_variance4x4_sse2=vp8_variance4x4_wmt;
add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance8x8 mmx sse2 media neon/;
$vp8_variance8x8_sse2=vp8_variance8x8_wmt;
$vp8_variance8x8_media=vp8_variance8x8_armv6;
add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance8x16 mmx sse2 neon/;
$vp8_variance8x16_sse2=vp8_variance8x16_wmt;
add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance16x8 mmx sse2 neon/;
$vp8_variance16x8_sse2=vp8_variance16x8_wmt;
add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_variance16x16 mmx sse2 media neon/;
$vp8_variance16x16_sse2=vp8_variance16x16_wmt;
$vp8_variance16x16_media=vp8_variance16x16_armv6;
#
# Sub-pixel Variance
#
@@ -308,12 +283,6 @@ $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
#
if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
#
# Sum of squares (vector)
#
add_proto qw/unsigned int vp8_get_mb_ss/, "const short *";
specialize qw/vp8_get_mb_ss mmx sse2/;
#
# SSE (Sum Squared Error)
#
@@ -321,14 +290,6 @@ add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_pt
specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/;
$vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;
add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp8_mse16x16 mmx sse2 media neon/;
$vp8_mse16x16_sse2=vp8_mse16x16_wmt;
$vp8_mse16x16_media=vp8_mse16x16_armv6;
add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
specialize qw/vp8_get4x4sse_cs mmx neon/;
#
# Block copy
#
......
@@ -39,6 +39,7 @@ typedef void (*vpx_sad_multi_fn_t)(
const unsigned char *ref_array,
int ref_stride,
unsigned int *sad_array);
typedef void (*vpx_sad_multi_d_fn_t)
(
const unsigned char *src_ptr,
@@ -48,7 +49,7 @@ typedef void (*vpx_sad_multi_d_fn_t)
unsigned int *sad_array
);
typedef unsigned int (*vp8_variance_fn_t)
typedef unsigned int (*vpx_variance_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
@@ -68,37 +69,14 @@ typedef unsigned int (*vp8_subpixvariance_fn_t)
unsigned int *sse
);
typedef void (*vp8_ssimpf_fn_t)
(
unsigned char *s,
int sp,
unsigned char *r,
int rp,
unsigned long *sum_s,
unsigned long *sum_r,
unsigned long *sum_sq_s,
unsigned long *sum_sq_r,
unsigned long *sum_sxr
);
typedef unsigned int (*vp8_getmbss_fn_t)(const short *);
typedef unsigned int (*vp8_get16x16prederror_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int ref_stride
);
typedef struct variance_vtable
{
vpx_sad_fn_t sdf;
vp8_variance_fn_t vf;
vpx_variance_fn_t vf;
vp8_subpixvariance_fn_t svf;
vp8_variance_fn_t svf_halfpix_h;
vp8_variance_fn_t svf_halfpix_v;
vp8_variance_fn_t svf_halfpix_hv;
vpx_variance_fn_t svf_halfpix_h;
vpx_variance_fn_t svf_halfpix_v;
vpx_variance_fn_t svf_halfpix_hv;
vpx_sad_multi_fn_t sdx3f;
vpx_sad_multi_fn_t sdx8f;
vpx_sad_multi_d_fn_t sdx4df;
......
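With the typedef renamed, the vf and svf_halfpix_* slots of struct variance_vtable now hold vpx_dsp functions while svf remains a vp8 sub-pixel function. A hypothetical wiring sketch (the specific functions assigned here are only examples; this diff does not show where the table is filled in):

/* Hypothetical example of filling the updated vtable; the functions
 * assigned here are illustrative and not dictated by this diff. */
struct variance_vtable v16x16;
v16x16.sdf = vpx_sad16x16;                 /* vpx_sad_fn_t            */
v16x16.vf  = vpx_variance16x16;            /* vpx_variance_fn_t       */
v16x16.svf = vp8_sub_pixel_variance16x16;  /* vp8_subpixvariance_fn_t */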
@@ -8,44 +8,34 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vp8_rtcd.h"
#include "filter.h"
#include "variance.h"
unsigned int vp8_get_mb_ss_c
(
const short *src_ptr
)
{
unsigned int i = 0, sum = 0;
do
{
sum += (src_ptr[i] * src_ptr[i]);
i++;
}
while (i < 256);
return sum;
/* This is a bad idea.
* ctz = count trailing zeros */
static int ctz(int a) {
int b = 0;
while (a != 1) {
a >>= 1;
b++;
}
return b;
}
static void variance(
static unsigned int variance(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
int w,
int h,
unsigned int *sse,
int *sum)
unsigned int *sse)
{
int i, j;
int diff;
int diff, sum;
*sum = 0;
sum = 0;
*sse = 0;
for (i = 0; i < h; i++)
@@ -53,114 +43,17 @@ static void variance(
for (j = 0; j < w; j++)
{
diff = src_ptr[j] - ref_ptr[j];
*sum += diff;
sum += diff;
*sse += diff * diff;
}
src_ptr += source_stride;
ref_ptr += recon_stride;
}
}
unsigned int vp8_variance16x16_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 8));
}
unsigned int vp8_variance8x16_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
}
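Since block widths and heights here are always powers of two, ctz(w) + ctz(h) equals log2(w * h), so the return statement is the standard sse - sum^2 / (w * h) computed with a shift; for an 8x16 block the shift is 3 + 4 = 7, i.e. division by 128. A quick illustrative check (not part of the patch), reusing the ctz() defined above and <assert.h>:

/* Illustration only: the shift equals division by the pixel count
 * whenever w and h are powers of two. */
static void check_variance_shift(void)
{
    const int w = 8, h = 16;                  /* e.g. the 8x16 block size */
    const int shift = ctz(w) + ctz(h);        /* 3 + 4 = 7 */
    const unsigned int sum_sq = 100u * 100u;  /* any example sum squared */
    assert((sum_sq >> shift) == sum_sq / (unsigned int)(w * h));
}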
unsigned int vp8_variance16x8_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
}
unsigned int vp8_variance8x8_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
}
unsigned int vp8_variance4x4_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
}
unsigned int vp8_mse16x16_c(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse)
{
unsigned int var;
int avg;
variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
*sse = var;
return var;
}
/****************************************************************************
*
* ROUTINE : filter_block2d_bil_first_pass
@@ -304,7 +197,7 @@ unsigned int vp8_sub_pixel_variance4x4_c
/* Now filter Vertically */
var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
}
@@ -329,7 +222,7 @@ unsigned int vp8_sub_pixel_variance8x8_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
}
unsigned int vp8_sub_pixel_variance16x16_c
@@ -353,7 +246,7 @@ unsigned int vp8_sub_pixel_variance16x16_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
}
@@ -429,7 +322,7 @@ unsigned int vp8_sub_pixel_variance16x8_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
}
unsigned int vp8_sub_pixel_variance8x16_c
@@ -455,5 +348,5 @@ unsigned int vp8_sub_pixel_variance8x16_c
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
}
@@ -13,393 +13,6 @@
%define xmm_filter_shift 7
;unsigned int vp8_get_mb_ss_sse2
;(
; short *src_ptr
;)
global sym(vp8_get_mb_ss_sse2) PRIVATE
sym(vp8_get_mb_ss_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 1
GET_GOT rbx
push rsi
push rdi
sub rsp, 16
; end prolog
mov rax, arg(0) ;[src_ptr]
mov rcx, 8
pxor xmm4, xmm4
.NEXTROW:
movdqa xmm0, [rax]
movdqa xmm1, [rax+16]
movdqa xmm2, [rax+32]
movdqa xmm3, [rax+48]
pmaddwd xmm0, xmm0
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
pmaddwd xmm3, xmm3
paddd xmm0, xmm1
paddd xmm2, xmm3
paddd xmm4, xmm0
paddd xmm4, xmm2
add rax, 0x40
dec rcx
ja .NEXTROW
movdqa xmm3,xmm4
psrldq xmm4,8
paddd xmm4,xmm3
movdqa xmm3,xmm4
psrldq xmm4,4
paddd xmm4,xmm3
movq rax,xmm4
; begin epilog
add rsp, 16
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp8_get16x16var_sse2
;(
; unsigned char * src_ptr,
; int source_stride,
; unsigned char * ref_ptr,
; int recon_stride,
; unsigned int * SSE,
; int * Sum
;)
global sym(vp8_get16x16var_sse2) PRIVATE
sym(vp8_get16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx
push rsi
push rdi
; end prolog