Commit decead73 authored by Ronald S. Bultje's avatar Ronald S. Bultje Committed by Gerrit Code Review

Replace copy_memNxM functions with a generic copy/avg function.

Change-Id: I3ce849452ed4f08527de9565a9914d5ee36170aa
parent c13e0bcb
......@@ -22,8 +22,8 @@ extern "C" {
}
namespace {
typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h);
......
......@@ -38,8 +38,8 @@
*/
#define ALIGN_FILTERS_256 1
static void convolve_horiz_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
......@@ -80,8 +80,8 @@ static void convolve_horiz_c(const uint8_t *src, int src_stride,
}
}
static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
......@@ -122,8 +122,8 @@ static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
}
}
static void convolve_vert_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
......@@ -164,8 +164,8 @@ static void convolve_vert_c(const uint8_t *src, int src_stride,
}
}
static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
......@@ -207,8 +207,8 @@ static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
}
}
static void convolve_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
......@@ -237,8 +237,8 @@ static void convolve_c(const uint8_t *src, int src_stride,
w, h, taps);
}
static void convolve_avg_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
......@@ -267,8 +267,8 @@ static void convolve_avg_c(const uint8_t *src, int src_stride,
w, h, taps);
}
void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......@@ -277,8 +277,8 @@ void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
w, h, 8);
}
void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......@@ -287,8 +287,8 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
w, h, 8);
}
void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......@@ -297,8 +297,8 @@ void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
w, h, 8);
}
void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......@@ -307,8 +307,8 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
w, h, 8);
}
void vp9_convolve8_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......@@ -317,8 +317,8 @@ void vp9_convolve8_c(const uint8_t *src, int src_stride,
w, h, 8);
}
void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......@@ -339,33 +339,25 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
w, h);
}
void vp9_convolve_copy(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
if (w == 16 && h == 16) {
vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
} else if (w == 8 && h == 8) {
vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
} else if (w == 8 && h == 4) {
vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
} else {
int r;
for (r = h; r > 0; --r) {
memcpy(dst, src, w);
src += src_stride;
dst += dst_stride;
}
void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
int r;
for (r = h; r > 0; --r) {
memcpy(dst, src, w);
src += src_stride;
dst += dst_stride;
}
}
void vp9_convolve_avg(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
int x, y;
for (y = 0; y < h; ++y) {
......
......@@ -13,26 +13,12 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
// Not a convolution, a block copy conforming to the convolution prototype
void vp9_convolve_copy(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
// Not a convolution, a block average conforming to the convolution prototype
void vp9_convolve_avg(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
struct subpix_fn_table {
const int16_t (*filter_x)[8];
const int16_t (*filter_y)[8];
......
......@@ -194,93 +194,6 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
}
void vp9_copy_mem16x16_c(const uint8_t *src,
int src_stride,
uint8_t *dst,
int dst_stride) {
int r;
for (r = 0; r < 16; r++) {
#if !(CONFIG_FAST_UNALIGNED)
dst[0] = src[0];
dst[1] = src[1];
dst[2] = src[2];
dst[3] = src[3];
dst[4] = src[4];
dst[5] = src[5];
dst[6] = src[6];
dst[7] = src[7];
dst[8] = src[8];
dst[9] = src[9];
dst[10] = src[10];
dst[11] = src[11];
dst[12] = src[12];
dst[13] = src[13];
dst[14] = src[14];
dst[15] = src[15];
#else
((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];
((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];
#endif
src += src_stride;
dst += dst_stride;
}
}
void vp9_copy_mem8x8_c(const uint8_t *src,
int src_stride,
uint8_t *dst,
int dst_stride) {
int r;
for (r = 0; r < 8; r++) {
#if !(CONFIG_FAST_UNALIGNED)
dst[0] = src[0];
dst[1] = src[1];
dst[2] = src[2];
dst[3] = src[3];
dst[4] = src[4];
dst[5] = src[5];
dst[6] = src[6];
dst[7] = src[7];
#else
((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
#endif
src += src_stride;
dst += dst_stride;
}
}
void vp9_copy_mem8x4_c(const uint8_t *src,
int src_stride,
uint8_t *dst,
int dst_stride) {
int r;
for (r = 0; r < 4; r++) {
#if !(CONFIG_FAST_UNALIGNED)
dst[0] = src[0];
dst[1] = src[1];
dst[2] = src[2];
dst[3] = src[3];
dst[4] = src[4];
dst[5] = src[5];
dst[6] = src[6];
dst[7] = src[7];
#else
((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
#endif
src += src_stride;
dst += dst_stride;
}
}
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int_mv *src_mv,
......
......@@ -43,17 +43,6 @@ specialize vp9_idct_add_32x32
#
# RECON
#
prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem16x16 mmx sse2 dspr2
vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem8x8 mmx dspr2
vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem8x4 mmx
prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_d27_predictor_4x4
......@@ -275,22 +264,28 @@ specialize vp9_blend_b
#
# Sub Pixel Filters
#
prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve_copy sse2
prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve_avg sse2
prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8 ssse3
prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_horiz ssse3
prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_vert ssse3
prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg ssse3
prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_horiz ssse3
prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_vert ssse3
#
......
......@@ -121,8 +121,8 @@ void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
unsigned int output_height,
const short *filter);
void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......@@ -159,8 +159,8 @@ void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
}
}
void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......@@ -197,8 +197,8 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
}
}
void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......@@ -235,8 +235,8 @@ void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
}
}
void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......@@ -273,8 +273,8 @@ void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
}
}
void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......@@ -294,8 +294,8 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
}
}
void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
......
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION .text
%macro convolve_fn 1
INIT_XMM sse2
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
fx, fxs, fy, fys, w, h
mov r4d, dword wm
cmp r4d, 4
je .w4
cmp r4d, 8
je .w8
cmp r4d, 16
je .w16
cmp r4d, 32
je .w32
mov r4d, dword hm
.loop64:
movu m0, [srcq]
movu m1, [srcq+16]
movu m2, [srcq+32]
movu m3, [srcq+48]
add srcq, src_strideq
%ifidn %1, avg
pavgb m0, [dstq]
pavgb m1, [dstq+16]
pavgb m2, [dstq+32]
pavgb m3, [dstq+48]
%endif
mova [dstq ], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
add dstq, dst_strideq
dec r4d
jnz .loop64
RET
.w32:
mov r4d, dword hm
.loop32:
movu m0, [srcq]
movu m1, [srcq+16]
movu m2, [srcq+src_strideq]
movu m3, [srcq+src_strideq+16]
lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
pavgb m0, [dstq]
pavgb m1, [dstq +16]
pavgb m2, [dstq+dst_strideq]
pavgb m3, [dstq+dst_strideq+16]
%endif
mova [dstq ], m0
mova [dstq +16], m1
mova [dstq+dst_strideq ], m2
mova [dstq+dst_strideq+16], m3
lea dstq, [dstq+dst_strideq*2]
sub r4d, 2
jnz .loop32
RET
.w16:
mov r4d, dword hm
lea r5q, [src_strideq*3]
lea r6q, [dst_strideq*3]
.loop16:
movu m0, [srcq]
movu m1, [srcq+src_strideq]
movu m2, [srcq+src_strideq*2]
movu m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
pavgb m0, [dstq]
pavgb m1, [dstq+dst_strideq]
pavgb m2, [dstq+dst_strideq*2]
pavgb m3, [dstq+r6q]
%endif
mova [dstq ], m0
mova [dstq+dst_strideq ], m1
mova [dstq+dst_strideq*2], m2
mova [dstq+r6q ], m3
lea dstq, [dstq+dst_strideq*4]
sub r4d, 4
jnz .loop16
RET
INIT_MMX sse
.w8:
mov r4d, dword hm
lea r5q, [src_strideq*3]
lea r6q, [dst_strideq*3]
.loop8:
movu m0, [srcq]
movu m1, [srcq+src_strideq]
movu m2, [srcq+src_strideq*2]
movu m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
pavgb m0, [dstq]
pavgb m1, [dstq+dst_strideq]
pavgb m2, [dstq+dst_strideq*2]
pavgb m3, [dstq+r6q]
%endif
mova [dstq ], m0
mova [dstq+dst_strideq ], m1
mova [dstq+dst_strideq*2], m2
mova [dstq+r6q ], m3
lea dstq, [dstq+dst_strideq*4]
sub r4d, 4
jnz .loop8
RET
.w4:
mov r4d, dword hm
lea r5q, [src_strideq*3]
lea r6q, [dst_strideq*3]
.loop4:
movh m0, [srcq]
movh m1, [srcq+src_strideq]
movh m2, [srcq+src_strideq*2]
movh m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
pavgb m0, [dstq]
pavgb m1, [dstq+dst_strideq]
pavgb m2, [dstq+dst_strideq*2]
pavgb m3, [dstq+r6q]
%endif
movh [dstq ], m0
movh [dstq+dst_strideq ], m1
movh [dstq+dst_strideq*2], m2
movh [dstq+r6q ], m3
lea dstq, [dstq+dst_strideq*4]
sub r4d, 4
jnz .loop4
RET
%endmacro
convolve_fn copy
convolve_fn avg
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void copy_mem8x8_mmx(
; unsigned char *src,
; int src_stride,
; unsigned char *dst,
; int dst_stride
; )
global sym(vp9_copy_mem8x8_mmx) PRIVATE
sym(vp9_copy_mem8x8_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;src;
movq mm0, [rsi]
movsxd rax, dword ptr arg(1) ;src_stride;
mov rdi, arg(2) ;dst;
movq mm1, [rsi+rax]
movq mm2, [rsi+rax*2]
movsxd rcx, dword ptr arg(3) ;dst_stride
lea rsi, [rsi+rax*2]
movq [rdi], mm0
add rsi, rax
movq [rdi+rcx], mm1
movq [rdi+rcx*2], mm2