Commit 938b8dfc authored by Geza Lore

Extend convolution functions to 128x128 for ext-partition.

Change-Id: I7f7e26cd1d58eb38417200550c6fbf4108c9f942
parent 697bf5be
This diff is collapsed.
......@@ -22,8 +22,6 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#define MAX_CU_SIZE 128
using libvpx_test::ACMRandom;
namespace {
......
......@@ -25,8 +25,6 @@
#include "vpx_dsp/vpx_filter.h"
#include "vpx_mem/vpx_mem.h"
#define MAX_CU_SIZE 128
using libvpx_test::ACMRandom;
namespace {
......
......@@ -130,18 +130,21 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride,
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
uint8_t temp[135 * 64];
uint8_t temp[MAX_EXT_SIZE * MAX_CU_SIZE];
int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= 64);
assert(h <= 64);
assert(w <= MAX_CU_SIZE);
assert(h <= MAX_CU_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
temp, MAX_CU_SIZE,
x_filters, x0_q4, x_step_q4, w, intermediate_height);
convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
convolve_vert(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_CU_SIZE,
dst, dst_stride,
y_filters, y0_q4, y_step_q4, w, h);
}
......@@ -237,13 +240,14 @@ void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
/* Fixed size intermediate buffer places limits on parameters. */
DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
assert(w <= 64);
assert(h <= 64);
DECLARE_ALIGNED(16, uint8_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]);
assert(w <= MAX_CU_SIZE);
assert(h <= MAX_CU_SIZE);
vpx_convolve8_c(src, src_stride, temp, 64,
vpx_convolve8_c(src, src_stride, temp, MAX_CU_SIZE,
filter_x, x_step_q4, filter_y, y_step_q4, w, h);
vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
vpx_convolve_avg_c(temp, MAX_CU_SIZE, dst, dst_stride,
NULL, 0, NULL, 0, w, h);
}
void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
......@@ -459,22 +463,23 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
uint16_t temp[64 * 135];
uint16_t temp[MAX_EXT_SIZE * MAX_CU_SIZE];
int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= 64);
assert(h <= 64);
assert(w <= MAX_CU_SIZE);
assert(h <= MAX_CU_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
src_stride, CONVERT_TO_BYTEPTR(temp), 64,
highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
x_filters, x0_q4, x_step_q4, w,
intermediate_height, bd);
highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
w, h, bd);
highbd_convolve_vert(
CONVERT_TO_BYTEPTR(temp) + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_CU_SIZE,
dst, dst_stride,
y_filters, y0_q4, y_step_q4, w, h, bd);
}
......@@ -556,13 +561,15 @@ void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
// Fixed size intermediate buffer places limits on parameters.
DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
assert(w <= 64);
assert(h <= 64);
DECLARE_ALIGNED(16, uint16_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]);
assert(w <= MAX_CU_SIZE);
assert(h <= MAX_CU_SIZE);
vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
vpx_highbd_convolve8_c(src, src_stride,
CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
dst, dst_stride,
NULL, 0, NULL, 0, w, h, bd);
}
......
......@@ -17,6 +17,24 @@
extern "C" {
#endif
// Note: Fixed size intermediate buffers place limits on the parameters
// of some functions. 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
// (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (135) for the
// default (non ext-partition) configuration:
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
// original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
//
// MAX_EXT_SIZE is that maximum row count. It is 263 when both CONFIG_VP10
// and CONFIG_EXT_PARTITION are enabled and 135 otherwise.
// NOTE(review): 263 presumably follows from the same derivation with a
// largest block size of 128x128 instead of 64x64 -- TODO confirm.
#if CONFIG_VP10 && CONFIG_EXT_PARTITION
# define MAX_EXT_SIZE 263
#else
# define MAX_EXT_SIZE 135
#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
......
......@@ -20,6 +20,12 @@
extern "C" {
#endif
// Upper bound on block width and height: 128 when both CONFIG_VP10 and
// CONFIG_EXT_PARTITION are enabled, 64 otherwise. Code that uses fixed size
// scratch buffers can size them from this value and assert
// w <= MAX_CU_SIZE and h <= MAX_CU_SIZE.
#if CONFIG_VP10 && CONFIG_EXT_PARTITION
# define MAX_CU_SIZE 128
#else
# define MAX_CU_SIZE 64
#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
#define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
......
......@@ -466,52 +466,44 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# Sub Pixel Filters
#
add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_scaled_2d ssse3/;
add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_scaled_horiz/;
add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_scaled_vert/;
add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_scaled_avg_2d/;
add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_scaled_avg_horiz/;
add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_scaled_avg_vert/;
add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vpx_convolve_copy /, "$sse2_x86inc";
specialize qw/vpx_convolve_avg /, "$sse2_x86inc";
specialize qw/vpx_convolve8 sse2 ssse3/, "$avx2_ssse3";
specialize qw/vpx_convolve8_horiz sse2 ssse3/, "$avx2_ssse3";
specialize qw/vpx_convolve8_vert sse2 ssse3/, "$avx2_ssse3";
specialize qw/vpx_convolve8_avg sse2 ssse3/;
specialize qw/vpx_convolve8_avg_horiz sse2 ssse3/;
specialize qw/vpx_convolve8_avg_vert sse2 ssse3/;
specialize qw/vpx_scaled_2d ssse3/;
# TODO(any): These need to be extended to support block sizes up to 128x128
if (!(vpx_config("CONFIG_VP10") eq "yes" && vpx_config("CONFIG_EXT_PARTITION") eq "yes")) {
specialize qw/vpx_convolve_copy neon dspr2 msa/;
specialize qw/vpx_convolve_avg neon dspr2 msa/;
specialize qw/vpx_convolve8 neon dspr2 msa/;
specialize qw/vpx_convolve8_horiz neon dspr2 msa/;
specialize qw/vpx_convolve8_vert neon dspr2 msa/;
specialize qw/vpx_convolve8_avg neon dspr2 msa/;
specialize qw/vpx_convolve8_avg_horiz neon dspr2 msa/;
specialize qw/vpx_convolve8_avg_vert neon dspr2 msa/;
}
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# Sub Pixel Filters
#
add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
......
......@@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/vpx_convolve.h"
typedef void filter8_1dfunction (
const uint8_t *src_ptr,
......@@ -112,25 +113,27 @@ void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
int w, int h) { \
assert(filter_x[3] != 128); \
assert(filter_y[3] != 128); \
assert(w <= 64); \
assert(h <= 64); \
assert(w <= MAX_CU_SIZE); \
assert(h <= MAX_CU_SIZE); \
assert(x_step_q4 == 16); \
assert(y_step_q4 == 16); \
if (filter_x[0] || filter_x[1] || filter_x[2]|| \
filter_y[0] || filter_y[1] || filter_y[2]) { \
DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+7)]); \
vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
fdata2, MAX_CU_SIZE, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 7); \
vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_CU_SIZE, MAX_CU_SIZE, \
dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} else { \
DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+1)]); \
vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_CU_SIZE, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 1); \
vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
vpx_convolve8_##avg##vert_##opt(fdata2, MAX_CU_SIZE, dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} \
......@@ -250,31 +253,40 @@ void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
int w, int h, int bd) { \
assert(w <= 64); \
assert(h <= 64); \
assert(w <= MAX_CU_SIZE); \
assert(h <= MAX_CU_SIZE); \
if (x_step_q4 == 16 && y_step_q4 == 16) { \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+7)]); \
vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, \
src_stride, \
CONVERT_TO_BYTEPTR(fdata2), \
MAX_CU_SIZE, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h + 7, bd); \
vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
64, dst, dst_stride, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h, bd); \
vpx_highbd_convolve8_##avg##vert_##opt( \
CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_CU_SIZE, \
MAX_CU_SIZE, \
dst, \
dst_stride, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h, bd); \
} else { \
DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+1)]); \
vpx_highbd_convolve8_horiz_##opt(src, \
src_stride, \
CONVERT_TO_BYTEPTR(fdata2), \
MAX_CU_SIZE, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h + 1, bd); \
vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
dst, dst_stride, \
vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), \
MAX_CU_SIZE, \
dst, \
dst_stride, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h, bd); \
......
......@@ -46,6 +46,119 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
je .w16
cmp r4d, 32
je .w32
%if CONFIG_VP10 && CONFIG_EXT_PARTITION
cmp r4d, 64
je .w64
%ifidn %2, highbd
cmp r4d, 128
je .w128
.w256:
mov r4d, dword hm
.loop256:
movu m0, [srcq]
movu m1, [srcq+16]
movu m2, [srcq+32]
movu m3, [srcq+48]
%ifidn %1, avg
pavg m0, [dstq]
pavg m1, [dstq+16]
pavg m2, [dstq+32]
pavg m3, [dstq+48]
%endif
mova [dstq ], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
movu m0, [srcq+64]
movu m1, [srcq+80]
movu m2, [srcq+96]
movu m3, [srcq+112]
%ifidn %1, avg
pavg m0, [dstq+64]
pavg m1, [dstq+80]
pavg m2, [dstq+96]
pavg m3, [dstq+112]
%endif
mova [dstq+64], m0
mova [dstq+80], m1
mova [dstq+96], m2
mova [dstq+112], m3
movu m0, [srcq+128]
movu m1, [srcq+128+16]
movu m2, [srcq+128+32]
movu m3, [srcq+128+48]
%ifidn %1, avg
pavg m0, [dstq+128]
pavg m1, [dstq+128+16]
pavg m2, [dstq+128+32]
pavg m3, [dstq+128+48]
%endif
mova [dstq+128 ], m0
mova [dstq+128+16], m1
mova [dstq+128+32], m2
mova [dstq+128+48], m3
movu m0, [srcq+128+64]
movu m1, [srcq+128+80]
movu m2, [srcq+128+96]
movu m3, [srcq+128+112]
add srcq, src_strideq
%ifidn %1, avg
pavg m0, [dstq+128+64]
pavg m1, [dstq+128+80]
pavg m2, [dstq+128+96]
pavg m3, [dstq+128+112]
%endif
mova [dstq+128+64], m0
mova [dstq+128+80], m1
mova [dstq+128+96], m2
mova [dstq+128+112], m3
add dstq, dst_strideq
sub r4d, 1
jnz .loop256
RET
%endif
.w128:
mov r4d, dword hm
.loop128:
movu m0, [srcq]
movu m1, [srcq+16]
movu m2, [srcq+32]
movu m3, [srcq+48]
%ifidn %1, avg
pavg m0, [dstq]
pavg m1, [dstq+16]
pavg m2, [dstq+32]
pavg m3, [dstq+48]
%endif
mova [dstq ], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
movu m0, [srcq+64]
movu m1, [srcq+80]
movu m2, [srcq+96]
movu m3, [srcq+112]
add srcq, src_strideq
%ifidn %1, avg
pavg m0, [dstq+64]
pavg m1, [dstq+80]
pavg m2, [dstq+96]
pavg m3, [dstq+112]
%endif
mova [dstq+64], m0
mova [dstq+80], m1
mova [dstq+96], m2
mova [dstq+112], m3
add dstq, dst_strideq
sub r4d, 1
jnz .loop128
RET
%else ; CONFIG_VP10 && CONFIG_EXT_PARTITION
%ifidn %2, highbd
cmp r4d, 64
je .w64
......@@ -82,10 +195,11 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
mova [dstq+96], m2
mova [dstq+112], m3
add dstq, dst_strideq
dec r4d
sub r4d, 1
jnz .loop128
RET
%endif
%endif ; CONFIG_VP10 && CONFIG_EXT_PARTITION
.w64
mov r4d, dword hm
......@@ -106,7 +220,7 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
mova [dstq+32], m2
mova [dstq+48], m3
add dstq, dst_strideq
dec r4d
sub r4d, 1
jnz .loop64
RET
......
......@@ -844,34 +844,49 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
// --Require an additional 8 rows for the horiz_w8 transpose tail.
DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_CU_SIZE]);
const int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= 64);
assert(h <= 64);
assert(w <= MAX_CU_SIZE);
assert(h <= MAX_CU_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
if (w >= 8) {
scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
src_stride,
temp,
MAX_CU_SIZE,
x_filters, x0_q4, x_step_q4,
w, intermediate_height);
} else {
scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
src_stride,
temp,
MAX_CU_SIZE,
x_filters, x0_q4, x_step_q4,
w, intermediate_height);
}
if (w >= 16) {
scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
dst_stride, y_filters, y0_q4, y_step_q4, w, h);
scaledconvolve_vert_w16(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
MAX_CU_SIZE,
dst,
dst_stride,
y_filters, y0_q4, y_step_q4, w, h);
} else if (w == 8) {
scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
dst_stride, y_filters, y0_q4, y_step_q4, w, h);
scaledconvolve_vert_w8(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
MAX_CU_SIZE,
dst,
dst_stride,
y_filters, y0_q4, y_step_q4, w, h);
} else {
scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
dst_stride, y_filters, y0_q4, y_step_q4, w, h);
scaledconvolve_vert_w4(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
MAX_CU_SIZE,
dst,
dst_stride,
y_filters, y0_q4, y_step_q4, w, h);
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment