Commit 5d24b6f0 authored by Timothy B. Terriberry, committed by Tim Terriberry

encoder: Remove 64x upsampled reference buffers

They do not handle border extension correctly (interpolation and
border extension do not commute unless you upsample into the
border), nor do they handle crop dimensions that are not a multiple
of 8 (the upsampled version is not sufficiently large), in addition
to using massive amounts of memory and being a criminal waste of
cache (1 byte used for every 8 bytes fetched).

This commit reimplements use_upsampled_references by computing the
subpixel samples on the fly. This implementation not only corrects
the border handling, but is also faster, while maintaining the
same quality.
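
Schematically, where the old code read every eighth sample out of a
prefiltered plane 64x the size of the reference, the new code hands the
1/8-pel phase to the predictor and filters the original reference
directly (a sketch; upsampled_ref and up_stride are illustrative, only
aom_upsampled_pred() is real):

    /* Old: index a prefiltered, 64x-size plane at an 8-sample stride. */
    comp_pred[j] = upsampled_ref[subpel_y_q3 * up_stride + 8 * j + subpel_x_q3];
    /* New: compute the subpel sample on the fly from the reference. */
    aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
                       ref, ref_stride);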

HL AWCY results are basically noise:
    PSNR | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
  0.0188 |   0.0187 | 0.0045 |  0.0063 |     0.0228

Change-Id: I7527db9f83b87a7bb8b35342f7e6457cd0bef9cd
parent 0eac3199
@@ -911,15 +911,15 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
 #
 # ...
 #
-add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride";
+add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride";
 specialize qw/aom_upsampled_pred sse2/;
-add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride";
 specialize qw/aom_comp_avg_upsampled_pred sse2/;
 if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, const uint8_t *ref8, int ref_stride";
+  add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd";
   specialize qw/aom_highbd_upsampled_pred sse2/;
-  add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+  add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd";
   specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
 }
@@ -1480,10 +1480,10 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
 if (aom_config("CONFIG_EXT_INTER") eq "yes") {
   add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-  add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+  add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
   if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-    add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+    add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd";
   }
 }
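
The new subsample/subpel arguments carry the fractional part of the motion
vector in 1/8-pel (Q3) units; the interpolation kernels are indexed at
1/16-pel precision, which is why the implementations below double the phase
(subpel_x_q3 << 1) when selecting a kernel. A hedged sketch of a caller
(mv_row and mv_col are hypothetical; AV1 motion vectors are stored in
1/8-pel units):

    /* Split a 1/8-pel motion vector into an integer offset into the
     * reference and a Q3 subpel phase. */
    const int subpel_x_q3 = mv_col & 7;
    const int subpel_y_q3 = mv_row & 7;
    const uint8_t *ref_blk = ref + (mv_row >> 3) * ref_stride + (mv_col >> 3);
    aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
                       ref_blk, ref_stride);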
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 #include <stdlib.h>
+#include <string.h>
+#include <assert.h>
 #include "./aom_config.h"
 #include "./aom_dsp_rtcd.h"
@@ -20,6 +22,9 @@
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/blend.h"
+#include "./av1_rtcd.h"
+#include "av1/common/filter.h"
 
 uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride) {
   int distortion = 0;
@@ -271,33 +276,66 @@ void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
 // Get pred block from up-sampled reference.
 void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height,
-                          const uint8_t *ref, int ref_stride) {
-  int i, j, k;
-  int stride = ref_stride << 3;
-  for (i = 0; i < height; i++) {
-    for (j = 0, k = 0; j < width; j++, k += 8) {
-      comp_pred[j] = ref[k];
+                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                          int ref_stride) {
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    int i;
+    for (i = 0; i < height; i++) {
+      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+      comp_pred += width;
+      ref += ref_stride;
+    }
+  } else {
+    InterpFilterParams filter;
+    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
+    if (!subpel_y_q3) {
+      const int16_t *kernel;
+      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+      /*Directly call C version to allow this to work for small (2x2) sizes.*/
+      aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
+                            -1, width, height);
+    } else if (!subpel_x_q3) {
+      const int16_t *kernel;
+      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+      /*Directly call C version to allow this to work for small (2x2) sizes.*/
+      aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
+                           16, width, height);
+    } else {
+      DECLARE_ALIGNED(16, uint8_t,
+                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+      const int16_t *kernel_x;
+      const int16_t *kernel_y;
+      int intermediate_height;
+      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+      intermediate_height =
+          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+      /*Directly call C versions to allow this to work for small (2x2) sizes.*/
+      aom_convolve8_horiz_c(ref - ref_stride * ((filter.taps >> 1) - 1),
+                            ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL,
+                            -1, width, intermediate_height);
+      aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1),
+                           MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y,
+                           16, width, height);
     }
-    comp_pred += width;
-    ref += stride;
   }
 }
 
 void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
-                                   int width, int height, const uint8_t *ref,
+                                   int width, int height, int subpel_x_q3,
+                                   int subpel_y_q3, const uint8_t *ref,
                                    int ref_stride) {
   int i, j;
-  int stride = ref_stride << 3;
+  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
+                     ref_stride);
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
-      const int tmp = ref[(j << 3)] + pred[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+      comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
     }
     comp_pred += width;
     pred += width;
-    ref += stride;
   }
 }
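
The intermediate_height bound above falls out of the two-pass structure: the
vertical pass anchors its last output row floor(((height - 1) * 8 +
subpel_y_q3) / 8) rows below its first, and the filter reads filter.taps
source rows around each output row, so the horizontal pass must produce that
offset plus filter.taps rows. A worked instance (a sketch assuming the 8-tap
EIGHTTAP_REGULAR filter used above):

    /* height = 16, subpel_y_q3 = 5, filter.taps = 8:
     * the last output row is anchored (15 * 8 + 5) >> 3 = 15 rows down,
     * and the 8-tap window adds 8 more, so the horizontal pass must
     * produce 15 + 8 = 23 intermediate rows. */
    int intermediate_height = (((16 - 1) * 8 + 5) >> 3) + 8; /* == 23 */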
@@ -637,37 +675,76 @@ void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
 }
 
 void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
-                                 const uint8_t *ref8, int ref_stride) {
-  int i, j;
-  int stride = ref_stride << 3;
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      comp_pred[j] = ref[(j << 3)];
+                                 int subpel_x_q3, int subpel_y_q3,
+                                 const uint8_t *ref8, int ref_stride, int bd) {
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    const uint16_t *ref;
+    int i;
+    ref = CONVERT_TO_SHORTPTR(ref8);
+    for (i = 0; i < height; i++) {
+      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+      comp_pred += width;
+      ref += ref_stride;
+    }
+  } else {
+    InterpFilterParams filter;
+    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
+    if (!subpel_y_q3) {
+      const int16_t *kernel;
+      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+      /*Directly call C version to allow this to work for small (2x2) sizes.*/
+      aom_highbd_convolve8_horiz_c(ref8, ref_stride,
+                                   CONVERT_TO_BYTEPTR(comp_pred), width, kernel,
+                                   16, NULL, -1, width, height, bd);
+    } else if (!subpel_x_q3) {
+      const int16_t *kernel;
+      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+      /*Directly call C version to allow this to work for small (2x2) sizes.*/
+      aom_highbd_convolve8_vert_c(ref8, ref_stride,
+                                  CONVERT_TO_BYTEPTR(comp_pred), width, NULL,
+                                  -1, kernel, 16, width, height, bd);
+    } else {
+      DECLARE_ALIGNED(16, uint8_t,
+                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+      const uint16_t *ref;
+      const int16_t *kernel_x;
+      const int16_t *kernel_y;
+      int intermediate_height;
+      ref = CONVERT_TO_SHORTPTR(ref8);
+      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+      intermediate_height =
+          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+      /*Directly call C versions to allow this to work for small (2x2) sizes.*/
+      aom_highbd_convolve8_horiz_c(
+          CONVERT_TO_BYTEPTR(ref - ref_stride * ((filter.taps >> 1) - 1)),
+          ref_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL,
+          -1, width, intermediate_height, bd);
+      aom_highbd_convolve8_vert_c(
+          CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+          MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
+          16, width, height, bd);
     }
-    comp_pred += width;
-    ref += stride;
   }
 }
 
 void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
                                           const uint8_t *pred8, int width,
-                                          int height, const uint8_t *ref8,
-                                          int ref_stride) {
+                                          int height, int subpel_x_q3,
+                                          int subpel_y_q3, const uint8_t *ref8,
+                                          int ref_stride, int bd) {
   int i, j;
-  int stride = ref_stride << 3;
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
+                            ref8, ref_stride, bd);
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
-      const int tmp = pred[j] + ref[(j << 3)];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+      comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
     }
     comp_pred += width;
     pred += width;
-    ref += stride;
   }
 }
 #endif  // CONFIG_HIGHBITDEPTH
@@ -694,22 +771,23 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
 }
 
 void aom_comp_mask_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
-                                    int width, int height, const uint8_t *ref,
+                                    int width, int height, int subpel_x_q3,
+                                    int subpel_y_q3, const uint8_t *ref,
                                     int ref_stride, const uint8_t *mask,
                                     int mask_stride, int invert_mask) {
   int i, j;
-  int stride = ref_stride << 3;
+  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
+                     ref_stride);
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
       if (!invert_mask)
-        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[(j << 3)], pred[j]);
+        comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]);
       else
-        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[(j << 3)]);
+        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]);
     }
     comp_pred += width;
     pred += width;
-    ref += stride;
     mask += mask_stride;
   }
 }
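
For reference, AOM_BLEND_A64(a, v0, v1) from aom_dsp/blend.h is a rounded
6-bit alpha blend, ROUND_POWER_OF_TWO(a * v0 + (64 - a) * v1, 6), so the
rewritten loop blends the freshly filtered prediction already sitting in
comp_pred against pred exactly as the old loop blended ref[j << 3] against
pred. A minimal standalone equivalent of that arithmetic (a sketch, not the
library macro itself):

    /* Rounded 6-bit alpha blend, a in [0, 64]. */
    static int blend_a64(int a, int v0, int v1) {
      return (a * v0 + (64 - a) * v1 + 32) >> 6;
    }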
@@ -775,26 +853,24 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
   }
 }
 
-void aom_highbd_comp_mask_upsampled_pred_c(uint16_t *comp_pred,
-                                           const uint8_t *pred8, int width,
-                                           int height, const uint8_t *ref8,
-                                           int ref_stride, const uint8_t *mask,
-                                           int mask_stride, int invert_mask) {
+void aom_highbd_comp_mask_upsampled_pred_c(
+    uint16_t *comp_pred, const uint8_t *pred8, int width, int height,
+    int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride,
+    const uint8_t *mask, int mask_stride, int invert_mask, int bd) {
   int i, j;
-  int stride = ref_stride << 3;
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
+                            ref8, ref_stride, bd);
   for (i = 0; i < height; ++i) {
     for (j = 0; j < width; ++j) {
       if (!invert_mask)
-        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j << 3], pred[j]);
+        comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]);
       else
-        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j << 3]);
+        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]);
     }
     comp_pred += width;
     pred += width;
-    ref += stride;
     mask += mask_stride;
   }
 }
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
+#include <assert.h>
 #include <emmintrin.h>  // SSE2
 #include "./aom_config.h"
@@ -16,6 +17,9 @@
 #include "aom_ports/mem.h"
+#include "./av1_rtcd.h"
+#include "av1/common/filter.h"
 
 typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
                                        uint32_t *sse, int *sum);
@@ -565,131 +569,96 @@ FNS(sse2);
 #undef FN
 
 void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
-                                    const uint8_t *ref8, int ref_stride) {
-  int i, j;
-  int stride = ref_stride << 3;
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  if (width >= 8) {
-    // read 8 points at one time
-    for (i = 0; i < height; i++) {
-      for (j = 0; j < width; j += 8) {
-        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
-        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
-        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
-        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
-        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
-        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
-        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
-        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
-        __m128i t0, t1, t2, t3;
-        t0 = _mm_unpacklo_epi16(s0, s1);
-        t1 = _mm_unpacklo_epi16(s2, s3);
-        t2 = _mm_unpacklo_epi16(s4, s5);
-        t3 = _mm_unpacklo_epi16(s6, s7);
-        t0 = _mm_unpacklo_epi32(t0, t1);
-        t2 = _mm_unpacklo_epi32(t2, t3);
-        t0 = _mm_unpacklo_epi64(t0, t2);
-        _mm_storeu_si128((__m128i *)(comp_pred), t0);
-        comp_pred += 8;
-        ref += 64;  // 8 * 8;
-      }
-      ref += stride - (width << 3);
-    }
-  } else {
-    // read 4 points at one time
-    for (i = 0; i < height; i++) {
-      for (j = 0; j < width; j += 4) {
-        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
-        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
-        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
-        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
-        __m128i t0, t1;
-        t0 = _mm_unpacklo_epi16(s0, s1);
-        t1 = _mm_unpacklo_epi16(s2, s3);
-        t0 = _mm_unpacklo_epi32(t0, t1);
-        _mm_storel_epi64((__m128i *)(comp_pred), t0);
-        comp_pred += 4;
-        ref += 4 * 8;
-      }
-      ref += stride - (width << 3);
-    }
-  }
-}
+                                    int subpel_x_q3, int subpel_y_q3,
+                                    const uint8_t *ref8, int ref_stride,
+                                    int bd) {
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+    if (width >= 8) {
+      int i;
+      assert(!(width & 7));
+      /*Read 8 pixels one row at a time.*/
+      for (i = 0; i < height; i++) {
+        int j;
+        for (j = 0; j < width; j += 8) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          _mm_storeu_si128((__m128i *)comp_pred, s0);
+          comp_pred += 8;
+          ref += 8;
+        }
+        ref += ref_stride - width;
+      }
+    } else {
+      int i;
+      assert(!(width & 3));
+      /*Read 4 pixels two rows at a time.*/
+      for (i = 0; i < height; i += 2) {
+        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
+        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
+        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
+        _mm_storeu_si128((__m128i *)comp_pred, t0);
+        comp_pred += 8;
+        ref += 2 * ref_stride;
+      }
+    }
+  } else {
+    InterpFilterParams filter;
+    filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR);
+    if (!subpel_y_q3) {
+      const int16_t *kernel;
+      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+      aom_highbd_convolve8_horiz(ref8, ref_stride,
+                                 CONVERT_TO_BYTEPTR(comp_pred), width, kernel,
+                                 16, NULL, -1, width, height, bd);
+    } else if (!subpel_x_q3) {
+      const int16_t *kernel;
+      kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+      aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred),
+                                width, NULL, -1, kernel, 16, width, height, bd);
+    } else {
+      DECLARE_ALIGNED(16, uint16_t,
+                      temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+      const uint16_t *ref;
+      const int16_t *kernel_x;
+      const int16_t *kernel_y;
+      int intermediate_height;
+      ref = CONVERT_TO_SHORTPTR(ref8);
+      kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+      kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+      intermediate_height =
+          (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps;
+      assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+      aom_highbd_convolve8_horiz(
+          CONVERT_TO_BYTEPTR(ref - ref_stride * ((filter.taps >> 1) - 1)),
+          ref_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL,
+          -1, width, intermediate_height, bd);
+      aom_highbd_convolve8_vert(
+          CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)),
+          MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y,
+          16, width, height, bd);
+    }
+  }
+}
 
 void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
                                              const uint8_t *pred8, int width,
-                                             int height, const uint8_t *ref8,
-                                             int ref_stride) {
-  const __m128i one = _mm_set1_epi16(1);
-  int i, j;
-  int stride = ref_stride << 3;
+                                             int height, int subpel_x_q3,
+                                             int subpel_y_q3,
+                                             const uint8_t *ref8,
+                                             int ref_stride, int bd) {
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  if (width >= 8) {
-    // read 8 points at one time
-    for (i = 0; i < height; i++) {
-      for (j = 0; j < width; j += 8) {
-        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
-        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
-        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
-        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
-        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
-        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
-        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
-        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
-        __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-        __m128i t0, t1, t2, t3;
-        t0 = _mm_unpacklo_epi16(s0, s1);
-        t1 = _mm_unpacklo_epi16(s2, s3);
-        t2 = _mm_unpacklo_epi16(s4, s5);
-        t3 = _mm_unpacklo_epi16(s6, s7);
-        t0 = _mm_unpacklo_epi32(t0, t1);
-        t2 = _mm_unpacklo_epi32(t2, t3);
-        t0 = _mm_unpacklo_epi64(t0, t2);
-        p0 = _mm_adds_epu16(t0, p0);
-        p0 = _mm_adds_epu16(p0, one);
-        p0 = _mm_srli_epi16(p0, 1);
-        _mm_storeu_si128((__m128i *)(comp_pred), p0);
-        comp_pred += 8;
-        pred += 8;
-        ref += 8 * 8;
-      }
-      ref += stride - (width << 3);
-    }
-  } else {
-    // read 4 points at one time
-    for (i = 0; i < height; i++) {
-      for (j = 0; j < width; j += 4) {
-        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
-        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
-        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
-        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
-        __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
-        __m128i t0, t1;
-        t0 = _mm_unpacklo_epi16(s0, s1);
-        t1 = _mm_unpacklo_epi16(s2, s3);
-        t0 = _mm_unpacklo_epi32(t0, t1);
-        p0 = _mm_adds_epu16(t0, p0);
-        p0 = _mm_adds_epu16(p0, one);
-        p0 = _mm_srli_epi16(p0, 1);
-        _mm_storel_epi64((__m128i *)(comp_pred), p0);
-        comp_pred += 4;
-        pred += 4;
-        ref += 4 * 8;
-      }
-      ref += stride - (width << 3);
-    }
-  }
-}
+  int n;
+  int i;
+  aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3,
+                            ref8, ref_stride, bd);
+  /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
+  assert(!(width * height & 7));
+  n = width * height >> 3;
+  for (i = 0; i < n; i++) {
+    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred);
+    __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+    _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu16(s0, p0));
+    comp_pred += 8;
+    pred += 8;
+  }
+}
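
The scalar code rounds with ROUND_POWER_OF_TWO(x + y, 1), i.e.
(x + y + 1) >> 1, and _mm_avg_epu16 computes exactly that rounded average on
eight unsigned 16-bit lanes at once, which is why the old adds/adds/srli
sequence with the one constant could be dropped. A standalone sanity check
(a sketch, not part of the patch):

    #include <assert.h>
    #include <emmintrin.h>

    int main(void) {
      __m128i a = _mm_set1_epi16(3);
      __m128i b = _mm_set1_epi16(4);
      /* Per-lane (a + b + 1) >> 1, matching ROUND_POWER_OF_TWO(a + b, 1). */
      __m128i avg = _mm_avg_epu16(a, b);
      assert(_mm_extract_epi16(avg, 0) == (3 + 4 + 1) >> 1);
      return 0;
    }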
@@ -17,6 +17,9 @@
 #include "aom_ports/mem.h"
+#include "./av1_rtcd.h"
+#include "av1/common/filter.h"
 
 typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                unsigned int *sse, int *sum);
@@ -477,214 +480,106 @@ FNS(ssse3, ssse3);
 #undef FN
 
 void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
+                             int subpel_x_q3, int subpel_y_q3,
                              const uint8_t *ref, int ref_stride) {
-  int i, j;
-  int stride = ref_stride << 3;
-  if (width >= 16) {
-    // read 16 points at one time
-    for (i = 0; i < height; i++) {
-      for (j = 0; j < width; j += 16) {
-        __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
-        __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
-        __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
-        __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
-        __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
-        __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
-        __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
-        __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
-        __m128i t0, t1, t2, t3;
-        t0 = _mm_unpacklo_epi8(s0, s1);
-        s1 = _mm_unpackhi_epi8(s0, s1);
-        t1 = _mm_unpacklo_epi8(s2, s3);
-        s3 = _mm_unpackhi_epi8(s2, s3);
-        t2 = _mm_unpacklo_epi8(s4, s5);
-        s5 = _mm_unpackhi_epi8(s4, s5);
-        t3 = _mm_unpacklo_epi8(s6, s7);
-        s7 = _mm_unpackhi_epi8(s6, s7);
-        s0 = _mm_unpacklo_epi8(t0, s1);
-        s2 = _mm_unpacklo_epi8(t1, s3);
-        s4 = _mm_unpacklo_epi8(t2, s5);
-        s6 = _mm_unpacklo_epi8(t3, s7);
-        s0 = _mm_unpacklo_epi32(s0, s2);
-        s4 = _mm_unpacklo_epi32(s4, s6);
-        s0 = _mm_unpacklo_epi64(s0, s4);
-        _mm_storeu_si128((__m128i *)(comp_pred), s0);
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    if (width >= 16) {
+      int i;
+      assert(!(width & 15));
+      /*Read 16 pixels one row at a time.*/
+      for (i = 0; i < height; i++) {
+        int j;