Xiph.Org / aom-rav1e / Commits
Commit a77ec1c9, authored May 24, 2017 by Debargha Mukherjee
Change warp filter to use one less precision bit
Change-Id: Idc7bb686f5751b0457c9f21daac0fa6f4865fd22
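The diff below adds a constant offset of (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - 1)) to every horizontal-pass output and starts the vertical-pass accumulator at -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)), so the intermediate values stay non-negative and no longer need a sign bit. Below is a minimal stand-alone sketch (not part of the commit) of why the two offsets cancel exactly, assuming WARPEDPIXEL_FILTER_BITS = 7, HORSHEAR_REDUCE_PREC_BITS = 5, VERSHEAR_REDUCE_PREC_BITS = 2 * 7 - 5, and filter taps that sum to 1 << WARPEDPIXEL_FILTER_BITS; none of these definitions appear in this diff.

    /* Hypothetical check: the horizontal offset, scaled by the vertical tap sum
     * (1 << FILTER_BITS), equals 1 << (bd + VERSHEAR_BITS - 1), which the new
     * initial value of the vertical accumulator subtracts again. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FILTER_BITS 7   /* assumed WARPEDPIXEL_FILTER_BITS */
    #define HORSHEAR_BITS 5 /* assumed HORSHEAR_REDUCE_PREC_BITS */
    #define VERSHEAR_BITS (2 * FILTER_BITS - HORSHEAR_BITS) /* assumed VERSHEAR_REDUCE_PREC_BITS */
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

    int main(void) {
      const int bd = 8;
      /* Any 8 taps summing to 1 << FILTER_BITS (an assumed property of warped_filter). */
      const int16_t coeffs[8] = { -1, 6, -18, 120, 28, -12, 6, -1 };
      const uint8_t ref[8] = { 10, 250, 3, 77, 128, 200, 55, 9 };

      /* Horizontal pass: old style (signed) vs. new style (offset, non-negative). */
      int32_t hsum_old = 0;
      int32_t hsum_new = 1 << (bd + FILTER_BITS - 1);
      for (int m = 0; m < 8; ++m) {
        hsum_old += ref[m] * coeffs[m];
        hsum_new += ref[m] * coeffs[m];
      }
      const int32_t tmp_old = ROUND_POWER_OF_TWO(hsum_old, HORSHEAR_BITS);
      const int32_t tmp_new = ROUND_POWER_OF_TWO(hsum_new, HORSHEAR_BITS); /* >= 0 */

      /* Vertical pass, simplified to a single tap of weight 1 << FILTER_BITS so the
       * tap-sum property is explicit. */
      const int32_t vsum_old = tmp_old * (1 << FILTER_BITS);
      const int32_t vsum_new =
          -(1 << (bd + VERSHEAR_BITS - 1)) + tmp_new * (1 << FILTER_BITS);

      assert(ROUND_POWER_OF_TWO(vsum_old, VERSHEAR_BITS) ==
             ROUND_POWER_OF_TWO(vsum_new, VERSHEAR_BITS));
      printf("both pipelines give %d\n", (int)ROUND_POWER_OF_TWO(vsum_old, VERSHEAR_BITS));
      return 0;
    }

With the cancellation in place the horizontal results can be stored and combined as unsigned 16-bit values, which appears to be what the title's "one less precision bit" refers to.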
parent 8feaaac8

Showing 4 changed files with 65 additions and 40 deletions
av1/common/warped_motion.c (+31, -22)
av1/common/x86/highbd_warp_plane_ssse3.c (+10, -4)
av1/common/x86/warp_plane_sse2.c (+11, -4)
av1/common/x86/warp_plane_ssse3.c (+13, -10)
av1/common/warped_motion.c
...
...
@@ -701,12 +701,8 @@ static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
   8240, 8224, 8208, 8192,
 };
 
-static INLINE int16_t saturate_int16(int32_t v) {
-  if (v > 32767)
-    return 32767;
-  else if (v < -32768)
-    return -32768;
-  return v;
-}
+static INLINE uint16_t saturate_uint(int32_t v, int bits) {
+  return (uint16_t)clamp(v, 0, (1 << bits) - 1);
+}
 
 #if CONFIG_WARPED_MOTION
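A quick worked range check (assuming WARPEDPIXEL_FILTER_BITS = 7 and HORSHEAR_REDUCE_PREC_BITS = 5, consistent with the "#if HORSHEAR_REDUCE_PREC_BITS >= 5" guard later in this file): the new helper is called below as saturate_uint(sum, bd + 7 - 5 + 1), so

    bd =  8:  bits = 11,  clamp range [0, 2047]
    bd = 10:  bits = 13,  clamp range [0, 8191]
    bd = 12:  bits = 15,  clamp range [0, 32767]

i.e. even at 12-bit depth the clamped value still fits in a 16-bit temporary.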
...
...
@@ -1028,14 +1024,18 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
         if (ix4 <= -7) {
           for (l = 0; l < 8; ++l) {
-            tmp[(k + 7) * 8 + l] = ref[iy * stride] *
-                                   (1 << (WARPEDPIXEL_FILTER_BITS -
-                                          HORSHEAR_REDUCE_PREC_BITS));
+            tmp[(k + 7) * 8 + l] =
+                (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                       HORSHEAR_REDUCE_PREC_BITS - 1)) +
+                ref[iy * stride] *
+                    (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
           }
         } else if (ix4 >= width + 6) {
           for (l = 0; l < 8; ++l) {
-            tmp[(k + 7) * 8 + l] = ref[iy * stride + (width - 1)] *
-                                   (1 << (WARPEDPIXEL_FILTER_BITS -
-                                          HORSHEAR_REDUCE_PREC_BITS));
+            tmp[(k + 7) * 8 + l] =
+                (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                       HORSHEAR_REDUCE_PREC_BITS - 1)) +
+                ref[iy * stride + (width - 1)] *
+                    (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
           }
         } else {
           int sx = sx4 + beta * (k + 4);
...
...
@@ -1045,14 +1045,16 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
             const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                              WARPEDPIXEL_PREC_SHIFTS;
             const int16_t *coeffs = warped_filter[offs];
-            int32_t sum = 0;
+            int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
             // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
             for (m = 0; m < 8; ++m) {
               sum += ref[iy * stride + ix + m] * coeffs[m];
             }
             sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
 #if HORSHEAR_REDUCE_PREC_BITS >= 5
-            tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+            tmp[(k + 7) * 8 + (l + 4)] =
+                saturate_uint(sum, bd + WARPEDPIXEL_FILTER_BITS -
+                                       HORSHEAR_REDUCE_PREC_BITS + 1);
 #else
             tmp[(k + 7) * 8 + (l + 4)] = sum;
 #endif
...
...
@@ -1070,7 +1072,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
             const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                              WARPEDPIXEL_PREC_SHIFTS;
             const int16_t *coeffs = warped_filter[offs];
-            int32_t sum = 0;
+            int32_t sum = -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1));
             // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
             for (m = 0; m < 8; ++m) {
               sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
...
...
@@ -1232,6 +1234,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                        int16_t delta) {
   int16_t tmp[15 * 8];
   int i, j, k, l, m;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
...
...
@@ -1288,8 +1291,10 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
         // (once border extension is taken into account)
         for (l = 0; l < 8; ++l) {
-          tmp[(k + 7) * 8 + l] = ref[iy * stride] *
-                                 (1 << (WARPEDPIXEL_FILTER_BITS -
-                                        HORSHEAR_REDUCE_PREC_BITS));
+          tmp[(k + 7) * 8 + l] =
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                     HORSHEAR_REDUCE_PREC_BITS - 1)) +
+              ref[iy * stride] *
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
         }
       } else if (ix4 >= width + 6) {
         // In this case, the leftmost pixel sampled is in column
...
...
@@ -1297,9 +1302,11 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
         // will sample only from the rightmost column
         // (once border extension is taken into account)
         for (l = 0; l < 8; ++l) {
-          tmp[(k + 7) * 8 + l] = ref[iy * stride + (width - 1)] *
-                                 (1 << (WARPEDPIXEL_FILTER_BITS -
-                                        HORSHEAR_REDUCE_PREC_BITS));
+          tmp[(k + 7) * 8 + l] =
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                     HORSHEAR_REDUCE_PREC_BITS - 1)) +
+              ref[iy * stride + (width - 1)] *
+                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
         }
       } else {
         // If we get here, then
...
...
@@ -1317,13 +1324,15 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += ref[iy * stride + ix + m] * coeffs[m];
           }
           sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
-          tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+          tmp[(k + 7) * 8 + (l + 4)] =
+              saturate_uint(sum, bd + WARPEDPIXEL_FILTER_BITS -
+                                     HORSHEAR_REDUCE_PREC_BITS + 1);
           sx += alpha;
         }
       }
...
...
@@ -1339,7 +1348,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1));
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
...
...
av1/common/x86/highbd_warp_plane_ssse3.c
...
...
@@ -89,8 +89,10 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
         else if (iy > height - 1)
           iy = height - 1;
-        tmp[k + 7] = _mm_set1_epi16(
-            ref[iy * stride] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+        tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
+            ref[iy * stride] *
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else if (ix4 >= width + 6) {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
...
...
@@ -100,8 +102,10 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
         else if (iy > height - 1)
           iy = height - 1;
-        tmp[k + 7] = _mm_set1_epi16(
-            ref[iy * stride + (width - 1)] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+        tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
+            ref[iy * stride + (width - 1)] *
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
...
...
@@ -151,7 +155,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
       const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
       const __m128i round_const =
-          _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                         ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
       // Calculate filtered results
       const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
...
...
@@ -299,7 +304,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
       // Round and pack into 8 bits
       const __m128i round_const =
-          _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                         ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
       const __m128i res_lo_round = _mm_srai_epi32(
           _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
...
...
av1/common/x86/warp_plane_sse2.c
...
...
@@ -23,6 +23,7 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
                        int16_t delta) {
   __m128i tmp[15];
   int i, j, k;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
...
...
@@ -84,8 +85,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
-        tmp[k + 7] = _mm_set1_epi16(
-            ref[iy * stride] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+        tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
+            ref[iy * stride] *
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else if (ix4 >= width + 6) {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
...
...
@@ -95,8 +98,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
-        tmp[k + 7] = _mm_set1_epi16(
-            ref[iy * stride + (width - 1)] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+        tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
+            ref[iy * stride + (width - 1)] *
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
...
...
@@ -145,7 +150,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
       const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
       const __m128i round_const =
-          _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                         ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
       // Calculate filtered results
       const __m128i src_0 = _mm_unpacklo_epi8(src, zero);
...
@@ -294,7 +300,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
       // Round and pack into 8 bits
       const __m128i round_const =
-          _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                         ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
       const __m128i res_lo_round = _mm_srai_epi32(
           _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
...
...
av1/common/x86/warp_plane_ssse3.c
...
...
@@ -210,6 +210,7 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
                        int16_t delta) {
   __m128i tmp[15];
   int i, j, k;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
...
...
@@ -271,8 +272,10 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
-        tmp[k + 7] = _mm_set1_epi16(
-            ref[iy * stride] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+        tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
+            ref[iy * stride] *
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else if (ix4 >= width + 6) {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
...
...
@@ -282,8 +285,10 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
-        tmp[k + 7] = _mm_set1_epi16(
-            ref[iy * stride + (width - 1)] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+        tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
+            ref[iy * stride + (width - 1)] *
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
...
...
@@ -365,7 +370,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
       const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
 
       const __m128i round_const =
-          _mm_set1_epi16((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi16((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                         ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
       // Note: res_02 + res_46 and res_13 + res_57 are always in the range
       // [-6120, 32640]. This gives us enough room to add the rounding
...
...
@@ -374,12 +380,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
           _mm_add_epi16(_mm_add_epi16(res_02, res_46), round_const);
       const __m128i res_b = _mm_add_epi16(res_13, res_57);
-      // Calculate (res_a + res_b) >> 1 while avoiding overflow
-      const __m128i t1 = _mm_and_si128(res_a, res_b);
-      const __m128i t2 = _mm_srai_epi16(_mm_xor_si128(res_a, res_b), 1);
-      const __m128i res =
-          _mm_srai_epi16(_mm_add_epi16(t1, t2), HORSHEAR_REDUCE_PREC_BITS - 1);
+      const __m128i res = _mm_srli_epi16(_mm_add_epi16(res_a, res_b),
+                                         HORSHEAR_REDUCE_PREC_BITS);
       tmp[k + 7] = res;
     }
   }
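A side note on the pattern removed above (my commentary, not from the commit): the t1/t2 sequence computes an overflow-safe average using the identity a + b = (a ^ b) + 2 * (a & b), so (a & b) + ((a ^ b) >> 1) equals (a + b) >> 1 without ever forming the full 16-bit sum. With the offset added by this commit the intermediates are non-negative, and the replacement simply adds res_a and res_b and applies one logical shift (_mm_srli_epi16) by the full HORSHEAR_REDUCE_PREC_BITS. A small scalar check of the identity (assumes arithmetic right shift for negative values, matching what _mm_srai_epi16 provides):

    #include <assert.h>
    #include <stdint.h>

    /* Overflow-safe floor((a + b) / 2) for 16-bit values, mirroring the removed
     * _mm_and_si128 / _mm_xor_si128 / _mm_srai_epi16 sequence. */
    static int16_t avg_no_overflow(int16_t a, int16_t b) {
      return (int16_t)((a & b) + ((a ^ b) >> 1));
    }

    int main(void) {
      for (int a = -32768; a <= 32767; a += 257) {
        for (int b = -32768; b <= 32767; b += 263) {
          const int32_t wide = ((int32_t)a + b) >> 1; /* floor of the exact sum */
          assert(avg_no_overflow((int16_t)a, (int16_t)b) == wide);
        }
      }
      return 0;
    }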
...
...
@@ -471,7 +473,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
       // Round and pack into 8 bits
       const __m128i round_const =
-          _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                         ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
       const __m128i res_lo_round = _mm_srai_epi32(
           _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
...
...