Commit 79362e33 authored by Debargha Mukherjee
Browse files

Revert "Limit to 192 filters for warp, clamp index since in some cases index 192"

This reverts commit 266db85d.

Reason for revert: Reverting to prevent software slowdown. Will be implemented differently in a separate patch.

Change-Id: I386a9661c87d69e22761e5c01507f2f1f968433f
parent f3e1ead3
......@@ -498,7 +498,7 @@ static uint8_t warp_interpolate(uint8_t *ref, int x, int y, int width,
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
/* clang-format off */
const int16_t warped_filter_taps[WARPEDPIXEL_PREC_SHIFTS * 3][8] = {
const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
#if WARPEDPIXEL_PREC_BITS == 6
// [-1, 0)
{ 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
......@@ -656,11 +656,10 @@ const int16_t warped_filter_taps[WARPEDPIXEL_PREC_SHIFTS * 3][8] = {
{0, 0, 1, -3, 8, 126, -5, 1}, {0, 0, 0, -1, 4, 127, -3, 1},
#endif // WARPEDPIXEL_PREC_BITS == 6
};
/* Returns the 8-tap coefficient row of warped_filter_taps selected by offs.
 * Any offs of 192 or more is clamped to the last row (index 191).
 * Negative offs values are not checked here. */
const int16_t *av1_get_warped_filter(int offs) {
  int row = offs;
  if (row >= 192) {
    row = 191;
  }
  return warped_filter_taps[row];
}
// dummy
{ 0, 0, 0, 0, 1, 127, 0, 0 },
};
/* clang-format on */
......@@ -1025,7 +1024,7 @@ void av1_highbd_warp_affine_c(int32_t *mat, uint16_t *ref, int width,
int ix = ix4 + l - 3;
const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
const int16_t *coeffs = av1_get_warped_filter(offs);
const int16_t *coeffs = warped_filter[offs];
int32_t sum = 0;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for (m = 0; m < 8; ++m) {
......@@ -1050,7 +1049,7 @@ void av1_highbd_warp_affine_c(int32_t *mat, uint16_t *ref, int width,
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
const int16_t *coeffs = av1_get_warped_filter(offs);
const int16_t *coeffs = warped_filter[offs];
int32_t sum = 0;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for (m = 0; m < 8; ++m) {
......@@ -1286,7 +1285,7 @@ void av1_warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
// At this point, sx = sx4 + alpha * l + beta * k
const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
const int16_t *coeffs = av1_get_warped_filter(offs);
const int16_t *coeffs = warped_filter[offs];
int32_t sum = 0;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for (m = 0; m < 8; ++m) {
......@@ -1308,7 +1307,7 @@ void av1_warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
// At this point, sy = sy4 + gamma * l + delta * k
const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
WARPEDPIXEL_PREC_SHIFTS;
const int16_t *coeffs = av1_get_warped_filter(offs);
const int16_t *coeffs = warped_filter[offs];
int32_t sum = 0;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for (m = 0; m < 8; ++m) {
......
......@@ -33,7 +33,7 @@
#define DEFAULT_WMTYPE AFFINE
#endif // CONFIG_WARPED_MOTION
const int16_t *av1_get_warped_filter(int offs);
// Table of 8-tap warp filter kernels, one row per filter offset.
// Declared `extern` here so that including this header only declares the
// table; without it every including translation unit would carry its own
// tentative definition, which fails to link under -fno-common.
// The single definition (with initializer) lives in the warped motion source.
extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
typedef void (*ProjectPointsFunc)(int32_t *mat, int *points, int *proj,
const int n, const int stride_points,
......
......@@ -14,6 +14,8 @@
#include "./av1_rtcd.h"
#include "av1/common/warped_motion.h"
/* View of warped_filter as an array of __m128i: each table row is 8 int16_t
 * (16 bytes), so filter[i] reads row i as one 128-bit vector.
 * NOTE(review): indexing through this pointer presumably requires
 * warped_filter to be 16-byte aligned -- confirm at its definition. */
static const __m128i *const filter =
    (const __m128i *)warped_filter;
/* SSE2 version of the rotzoom/affine warp filter */
void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
int stride, uint8_t *pred, int p_col, int p_row,
......@@ -96,14 +98,10 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
_mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
// Filter even-index pixels
__m128i tmp_0 = *(__m128i const *)av1_get_warped_filter(
(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_2 = *(__m128i const *)av1_get_warped_filter(
(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_4 = *(__m128i const *)av1_get_warped_filter(
(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_6 = *(__m128i const *)av1_get_warped_filter(
(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_0 = filter[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_2 = filter[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_4 = filter[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_6 = filter[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS];
// coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
__m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
......@@ -142,14 +140,10 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
HORSHEAR_REDUCE_PREC_BITS);
// Filter odd-index pixels
__m128i tmp_1 = *(__m128i const *)av1_get_warped_filter(
(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_3 = *(__m128i const *)av1_get_warped_filter(
(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_5 = *(__m128i const *)av1_get_warped_filter(
(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_7 = *(__m128i const *)av1_get_warped_filter(
(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_3 = filter[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_5 = filter[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_7 = filter[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
__m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
......@@ -197,14 +191,10 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
__m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
// Filter even-index pixels
__m128i tmp_0 = *(__m128i const *)av1_get_warped_filter(
(sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_2 = *(__m128i const *)av1_get_warped_filter(
(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_4 = *(__m128i const *)av1_get_warped_filter(
(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_6 = *(__m128i const *)av1_get_warped_filter(
(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_0 = filter[(sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_2 = filter[(sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_4 = filter[(sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_6 = filter[(sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
__m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
......@@ -230,14 +220,10 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
__m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
__m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
__m128i tmp_1 = *(__m128i const *)av1_get_warped_filter(
(sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_3 = *(__m128i const *)av1_get_warped_filter(
(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_5 = *(__m128i const *)av1_get_warped_filter(
(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_7 = *(__m128i const *)av1_get_warped_filter(
(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS);
__m128i tmp_1 = filter[(sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_3 = filter[(sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_5 = filter[(sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_7 = filter[(sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS];
__m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
__m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment