Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Xiph.Org
aom-rav1e
Commits
a77ec1c9
Commit
a77ec1c9
authored
May 24, 2017
by
Debargha Mukherjee
Browse files
Change warp filter to use one less precision bit
Change-Id: Idc7bb686f5751b0457c9f21daac0fa6f4865fd22
parent
8feaaac8
Changes
4
Hide whitespace changes
Inline
Side-by-side
av1/common/warped_motion.c
View file @
a77ec1c9
...
...
@@ -701,12 +701,8 @@ static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
8240
,
8224
,
8208
,
8192
,
};
static
INLINE
int16_t
saturate_int16
(
int32_t
v
)
{
if
(
v
>
32767
)
return
32767
;
else
if
(
v
<
-
32768
)
return
-
32768
;
return
v
;
/* Saturates a signed 32-bit value to an unsigned range that is `bits` wide.
 *
 * v:    value to saturate.
 * bits: width of the target range; the upper bound passed to clamp() is
 *       (1 << bits) - 1.
 *
 * Returns the result of clamp(v, 0, (1 << bits) - 1) converted to uint16_t.
 *
 * NOTE(review): clamp() is defined outside this view; it is presumed to
 * bound its first argument to [low, high] -- confirm against its definition.
 * The expression 1 << bits is evaluated as int, so `bits` must be small
 * enough for that shift to be representable, and an upper bound above 65535
 * would be truncated by the uint16_t conversion.
 */
static INLINE uint16_t saturate_uint(int32_t v, int bits) {
  const int32_t max_val = (1 << bits) - 1;
  return (uint16_t)clamp(v, 0, max_val);
}
#if CONFIG_WARPED_MOTION
...
...
@@ -1028,14 +1024,18 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
if
(
ix4
<=
-
7
)
{
for
(
l
=
0
;
l
<
8
;
++
l
)
{
tmp
[(
k
+
7
)
*
8
+
l
]
=
ref
[
iy
*
stride
]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
));
(
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
-
1
))
+
ref
[
iy
*
stride
]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
));
}
}
else
if
(
ix4
>=
width
+
6
)
{
for
(
l
=
0
;
l
<
8
;
++
l
)
{
tmp
[(
k
+
7
)
*
8
+
l
]
=
ref
[
iy
*
stride
+
(
width
-
1
)]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
));
tmp
[(
k
+
7
)
*
8
+
l
]
=
(
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
-
1
))
+
ref
[
iy
*
stride
+
(
width
-
1
)]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
));
}
}
else
{
int
sx
=
sx4
+
beta
*
(
k
+
4
);
...
...
@@ -1045,14 +1045,16 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
const
int
offs
=
ROUND_POWER_OF_TWO
(
sx
,
WARPEDDIFF_PREC_BITS
)
+
WARPEDPIXEL_PREC_SHIFTS
;
const
int16_t
*
coeffs
=
warped_filter
[
offs
];
int32_t
sum
=
0
;
int32_t
sum
=
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
1
)
;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for
(
m
=
0
;
m
<
8
;
++
m
)
{
sum
+=
ref
[
iy
*
stride
+
ix
+
m
]
*
coeffs
[
m
];
}
sum
=
ROUND_POWER_OF_TWO
(
sum
,
HORSHEAR_REDUCE_PREC_BITS
);
#if HORSHEAR_REDUCE_PREC_BITS >= 5
tmp
[(
k
+
7
)
*
8
+
(
l
+
4
)]
=
saturate_int16
(
sum
);
tmp
[(
k
+
7
)
*
8
+
(
l
+
4
)]
=
saturate_uint
(
sum
,
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
+
1
);
#else
tmp
[(
k
+
7
)
*
8
+
(
l
+
4
)]
=
sum
;
#endif
...
...
@@ -1070,7 +1072,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
const
int
offs
=
ROUND_POWER_OF_TWO
(
sy
,
WARPEDDIFF_PREC_BITS
)
+
WARPEDPIXEL_PREC_SHIFTS
;
const
int16_t
*
coeffs
=
warped_filter
[
offs
];
int32_t
sum
=
0
;
int32_t
sum
=
-
(
1
<<
(
bd
+
VERSHEAR_REDUCE_PREC_BITS
-
1
))
;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for
(
m
=
0
;
m
<
8
;
++
m
)
{
sum
+=
tmp
[(
k
+
m
+
4
)
*
8
+
(
l
+
4
)]
*
coeffs
[
m
];
...
...
@@ -1232,6 +1234,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
int16_t
delta
)
{
int16_t
tmp
[
15
*
8
];
int
i
,
j
,
k
,
l
,
m
;
const
int
bd
=
8
;
/* Note: For this code to work, the left/right frame borders need to be
extended by at least 13 pixels each. By the time we get here, other
...
...
@@ -1288,8 +1291,10 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
// (once border extension is taken into account)
for
(
l
=
0
;
l
<
8
;
++
l
)
{
tmp
[(
k
+
7
)
*
8
+
l
]
=
ref
[
iy
*
stride
]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
));
(
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
-
1
))
+
ref
[
iy
*
stride
]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
));
}
}
else
if
(
ix4
>=
width
+
6
)
{
// In this case, the leftmost pixel sampled is in column
...
...
@@ -1297,9 +1302,11 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
// will sample only from the rightmost column
// (once border extension is taken into account)
for
(
l
=
0
;
l
<
8
;
++
l
)
{
tmp
[(
k
+
7
)
*
8
+
l
]
=
ref
[
iy
*
stride
+
(
width
-
1
)]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
));
tmp
[(
k
+
7
)
*
8
+
l
]
=
(
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
-
1
))
+
ref
[
iy
*
stride
+
(
width
-
1
)]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
));
}
}
else
{
// If we get here, then
...
...
@@ -1317,13 +1324,15 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
const
int
offs
=
ROUND_POWER_OF_TWO
(
sx
,
WARPEDDIFF_PREC_BITS
)
+
WARPEDPIXEL_PREC_SHIFTS
;
const
int16_t
*
coeffs
=
warped_filter
[
offs
];
int32_t
sum
=
0
;
int32_t
sum
=
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
1
)
;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for
(
m
=
0
;
m
<
8
;
++
m
)
{
sum
+=
ref
[
iy
*
stride
+
ix
+
m
]
*
coeffs
[
m
];
}
sum
=
ROUND_POWER_OF_TWO
(
sum
,
HORSHEAR_REDUCE_PREC_BITS
);
tmp
[(
k
+
7
)
*
8
+
(
l
+
4
)]
=
saturate_int16
(
sum
);
tmp
[(
k
+
7
)
*
8
+
(
l
+
4
)]
=
saturate_uint
(
sum
,
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
+
1
);
sx
+=
alpha
;
}
}
...
...
@@ -1339,7 +1348,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
const
int
offs
=
ROUND_POWER_OF_TWO
(
sy
,
WARPEDDIFF_PREC_BITS
)
+
WARPEDPIXEL_PREC_SHIFTS
;
const
int16_t
*
coeffs
=
warped_filter
[
offs
];
int32_t
sum
=
0
;
int32_t
sum
=
-
(
1
<<
(
bd
+
VERSHEAR_REDUCE_PREC_BITS
-
1
))
;
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
for
(
m
=
0
;
m
<
8
;
++
m
)
{
sum
+=
tmp
[(
k
+
m
+
4
)
*
8
+
(
l
+
4
)]
*
coeffs
[
m
];
...
...
av1/common/x86/highbd_warp_plane_ssse3.c
View file @
a77ec1c9
...
...
@@ -89,8 +89,10 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
else
if
(
iy
>
height
-
1
)
iy
=
height
-
1
;
tmp
[
k
+
7
]
=
_mm_set1_epi16
(
(
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
-
1
))
+
ref
[
iy
*
stride
]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
}
}
else
if
(
ix4
>=
width
+
6
)
{
for
(
k
=
-
7
;
k
<
AOMMIN
(
8
,
p_height
-
i
);
++
k
)
{
...
...
@@ -100,8 +102,10 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
else
if
(
iy
>
height
-
1
)
iy
=
height
-
1
;
tmp
[
k
+
7
]
=
_mm_set1_epi16
(
(
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
-
1
))
+
ref
[
iy
*
stride
+
(
width
-
1
)]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
}
}
else
{
for
(
k
=
-
7
;
k
<
AOMMIN
(
8
,
p_height
-
i
);
++
k
)
{
...
...
@@ -151,7 +155,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
const
__m128i
coeff_6
=
_mm_unpackhi_epi64
(
tmp_12
,
tmp_14
);
const
__m128i
round_const
=
_mm_set1_epi32
((
1
<<
HORSHEAR_REDUCE_PREC_BITS
)
>>
1
);
_mm_set1_epi32
((
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
1
))
+
((
1
<<
HORSHEAR_REDUCE_PREC_BITS
)
>>
1
));
// Calculate filtered results
const
__m128i
res_0
=
_mm_madd_epi16
(
src
,
coeff_0
);
...
...
@@ -299,7 +304,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
// Round and pack into 8 bits
const
__m128i
round_const
=
_mm_set1_epi32
((
1
<<
VERSHEAR_REDUCE_PREC_BITS
)
>>
1
);
_mm_set1_epi32
(
-
(
1
<<
(
bd
+
VERSHEAR_REDUCE_PREC_BITS
-
1
))
+
((
1
<<
VERSHEAR_REDUCE_PREC_BITS
)
>>
1
));
const
__m128i
res_lo_round
=
_mm_srai_epi32
(
_mm_add_epi32
(
res_lo
,
round_const
),
VERSHEAR_REDUCE_PREC_BITS
);
...
...
av1/common/x86/warp_plane_sse2.c
View file @
a77ec1c9
...
...
@@ -23,6 +23,7 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
int16_t
delta
)
{
__m128i
tmp
[
15
];
int
i
,
j
,
k
;
const
int
bd
=
8
;
/* Note: For this code to work, the left/right frame borders need to be
extended by at least 13 pixels each. By the time we get here, other
...
...
@@ -84,8 +85,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
else
if
(
iy
>
height
-
1
)
iy
=
height
-
1
;
tmp
[
k
+
7
]
=
_mm_set1_epi16
(
(
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
-
1
))
+
ref
[
iy
*
stride
]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
}
}
else
if
(
ix4
>=
width
+
6
)
{
for
(
k
=
-
7
;
k
<
AOMMIN
(
8
,
p_height
-
i
);
++
k
)
{
...
...
@@ -95,8 +98,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
else
if
(
iy
>
height
-
1
)
iy
=
height
-
1
;
tmp
[
k
+
7
]
=
_mm_set1_epi16
(
(
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
-
1
))
+
ref
[
iy
*
stride
+
(
width
-
1
)]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
}
}
else
{
for
(
k
=
-
7
;
k
<
AOMMIN
(
8
,
p_height
-
i
);
++
k
)
{
...
...
@@ -145,7 +150,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
const
__m128i
coeff_6
=
_mm_unpackhi_epi64
(
tmp_12
,
tmp_14
);
const
__m128i
round_const
=
_mm_set1_epi32
((
1
<<
HORSHEAR_REDUCE_PREC_BITS
)
>>
1
);
_mm_set1_epi32
((
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
1
))
+
((
1
<<
HORSHEAR_REDUCE_PREC_BITS
)
>>
1
));
// Calculate filtered results
const
__m128i
src_0
=
_mm_unpacklo_epi8
(
src
,
zero
);
...
...
@@ -294,7 +300,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
// Round and pack into 8 bits
const
__m128i
round_const
=
_mm_set1_epi32
((
1
<<
VERSHEAR_REDUCE_PREC_BITS
)
>>
1
);
_mm_set1_epi32
(
-
(
1
<<
(
bd
+
VERSHEAR_REDUCE_PREC_BITS
-
1
))
+
((
1
<<
VERSHEAR_REDUCE_PREC_BITS
)
>>
1
));
const
__m128i
res_lo_round
=
_mm_srai_epi32
(
_mm_add_epi32
(
res_lo
,
round_const
),
VERSHEAR_REDUCE_PREC_BITS
);
...
...
av1/common/x86/warp_plane_ssse3.c
View file @
a77ec1c9
...
...
@@ -210,6 +210,7 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
int16_t
delta
)
{
__m128i
tmp
[
15
];
int
i
,
j
,
k
;
const
int
bd
=
8
;
/* Note: For this code to work, the left/right frame borders need to be
extended by at least 13 pixels each. By the time we get here, other
...
...
@@ -271,8 +272,10 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
else
if
(
iy
>
height
-
1
)
iy
=
height
-
1
;
tmp
[
k
+
7
]
=
_mm_set1_epi16
(
(
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
-
1
))
+
ref
[
iy
*
stride
]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
}
}
else
if
(
ix4
>=
width
+
6
)
{
for
(
k
=
-
7
;
k
<
AOMMIN
(
8
,
p_height
-
i
);
++
k
)
{
...
...
@@ -282,8 +285,10 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
else
if
(
iy
>
height
-
1
)
iy
=
height
-
1
;
tmp
[
k
+
7
]
=
_mm_set1_epi16
(
(
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
-
1
))
+
ref
[
iy
*
stride
+
(
width
-
1
)]
*
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
(
1
<<
(
WARPEDPIXEL_FILTER_BITS
-
HORSHEAR_REDUCE_PREC_BITS
)));
}
}
else
{
for
(
k
=
-
7
;
k
<
AOMMIN
(
8
,
p_height
-
i
);
++
k
)
{
...
...
@@ -365,7 +370,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
const
__m128i
res_57
=
_mm_maddubs_epi16
(
src_57
,
coeff_57
);
const
__m128i
round_const
=
_mm_set1_epi16
((
1
<<
HORSHEAR_REDUCE_PREC_BITS
)
>>
1
);
_mm_set1_epi16
((
1
<<
(
bd
+
WARPEDPIXEL_FILTER_BITS
-
1
))
+
((
1
<<
HORSHEAR_REDUCE_PREC_BITS
)
>>
1
));
// Note: res_02 + res_46 and res_13 + res_57 are always in the range
// [-6120, 32640]. This gives us enough room to add the rounding
...
...
@@ -374,12 +380,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
_mm_add_epi16
(
_mm_add_epi16
(
res_02
,
res_46
),
round_const
);
const
__m128i
res_b
=
_mm_add_epi16
(
res_13
,
res_57
);
// Calculate (res_a + res_b) >> 1 while avoiding overflow
const
__m128i
t1
=
_mm_and_si128
(
res_a
,
res_b
);
const
__m128i
t2
=
_mm_srai_epi16
(
_mm_xor_si128
(
res_a
,
res_b
),
1
);
const
__m128i
res
=
_mm_srai_epi16
(
_mm_add_epi16
(
t1
,
t2
),
HORSHEAR_REDUCE_PREC_BITS
-
1
);
const
__m128i
res
=
_mm_srli_epi16
(
_mm_add_epi16
(
res_a
,
res_b
),
HORSHEAR_REDUCE_PREC_BITS
);
tmp
[
k
+
7
]
=
res
;
}
}
...
...
@@ -471,7 +473,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
// Round and pack into 8 bits
const
__m128i
round_const
=
_mm_set1_epi32
((
1
<<
VERSHEAR_REDUCE_PREC_BITS
)
>>
1
);
_mm_set1_epi32
(
-
(
1
<<
(
bd
+
VERSHEAR_REDUCE_PREC_BITS
-
1
))
+
((
1
<<
VERSHEAR_REDUCE_PREC_BITS
)
>>
1
));
const
__m128i
res_lo_round
=
_mm_srai_epi32
(
_mm_add_epi32
(
res_lo
,
round_const
),
VERSHEAR_REDUCE_PREC_BITS
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment