Commit 26496c52 authored by Scott LaVarnway
Browse files

Quick modifications to wide loopfilter intrinsic functions

Modified to work with 8x8 blocks of memory.  Will revisit
later for further optimizations.  For the HD clip used, the
decoder improved by 20%.

Change-Id: Ia0057f55d66d1445882351ea6c43b595a5a980e5
parent bbd5cb2b
...@@ -184,7 +184,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch, ...@@ -184,7 +184,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
if (mask & 1) { if (mask & 1) {
if (mask_16x16 & 1) { if (mask_16x16 & 1) {
vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim, vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, 1); lfi->hev_thr);
assert(!(mask_8x8 & 1)); assert(!(mask_8x8 & 1));
assert(!(mask_4x4 & 1)); assert(!(mask_4x4 & 1));
assert(!(mask_4x4_int & 1)); assert(!(mask_4x4_int & 1));
...@@ -229,7 +229,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, ...@@ -229,7 +229,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
if (!only_4x4_1) { if (!only_4x4_1) {
if (mask_16x16 & 1) { if (mask_16x16 & 1) {
vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, 1); lfi->hev_thr);
assert(!(mask_8x8 & 1)); assert(!(mask_8x8 & 1));
assert(!(mask_4x4 & 1)); assert(!(mask_4x4 & 1));
assert(!(mask_4x4_int & 1)); assert(!(mask_4x4_int & 1));
......
...@@ -82,15 +82,4 @@ void vp9_loop_filter_partial_frame(struct VP9Common *cm, ...@@ -82,15 +82,4 @@ void vp9_loop_filter_partial_frame(struct VP9Common *cm,
void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi, void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
int sharpness_lvl); int sharpness_lvl);
void vp9_mb_lpf_horizontal_edge_w(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count);
void vp9_mb_lpf_vertical_edge_w(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count);
#endif // VP9_COMMON_VP9_LOOPFILTER_H_ #endif // VP9_COMMON_VP9_LOOPFILTER_H_
...@@ -255,16 +255,15 @@ static INLINE void wide_mbfilter(int8_t mask, uint8_t hev, ...@@ -255,16 +255,15 @@ static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
} }
} }
void vp9_mb_lpf_horizontal_edge_w(uint8_t *s, int p, void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p,
const uint8_t *blimit, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *limit,
const uint8_t *thresh, const uint8_t *thresh) {
int count) {
int i; int i;
// loop filter designed to work using chars so that we can make maximum use // loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions. // of 8 bit simd instructions.
for (i = 0; i < 8 * count; ++i) { for (i = 0; i < 8; ++i) {
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask = filter_mask(*limit, *blimit, const int8_t mask = filter_mask(*limit, *blimit,
...@@ -285,14 +284,13 @@ void vp9_mb_lpf_horizontal_edge_w(uint8_t *s, int p, ...@@ -285,14 +284,13 @@ void vp9_mb_lpf_horizontal_edge_w(uint8_t *s, int p,
} }
} }
void vp9_mb_lpf_vertical_edge_w(uint8_t *s, int p, void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p,
const uint8_t *blimit, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *limit,
const uint8_t *thresh, const uint8_t *thresh) {
int count) {
int i; int i;
for (i = 0; i < 8 * count; ++i) { for (i = 0; i < 8; ++i) {
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = filter_mask(*limit, *blimit, const int8_t mask = filter_mask(*limit, *blimit,
......
...@@ -86,8 +86,8 @@ fi ...@@ -86,8 +86,8 @@ fi
# #
# Loopfilter # Loopfilter
# #
prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
specialize vp9_mb_lpf_vertical_edge_w specialize vp9_mb_lpf_vertical_edge_w sse2
prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_vertical_edge specialize vp9_mbloop_filter_vertical_edge
...@@ -95,8 +95,8 @@ specialize vp9_mbloop_filter_vertical_edge ...@@ -95,8 +95,8 @@ specialize vp9_mbloop_filter_vertical_edge
prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_loop_filter_vertical_edge mmx specialize vp9_loop_filter_vertical_edge mmx
prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
specialize vp9_mb_lpf_horizontal_edge_w specialize vp9_mb_lpf_horizontal_edge_w sse2
prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_horizontal_edge specialize vp9_mbloop_filter_horizontal_edge
......
...@@ -23,14 +23,14 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, ...@@ -23,14 +23,14 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
const unsigned char *_blimit, const unsigned char *_blimit,
const unsigned char *_limit, const unsigned char *_limit,
const unsigned char *_thresh) { const unsigned char *_thresh) {
DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]); DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]);
DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]); DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]);
DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]); DECLARE_ALIGNED(16, unsigned char, flat_op[3][8]);
DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]); DECLARE_ALIGNED(16, unsigned char, flat_oq[3][8]);
DECLARE_ALIGNED(16, unsigned char, ap[8][16]); DECLARE_ALIGNED(16, unsigned char, ap[8][8]);
DECLARE_ALIGNED(16, unsigned char, aq[8][16]); DECLARE_ALIGNED(16, unsigned char, aq[8][8]);
__m128i mask, hev, flat, flat2; __m128i mask, hev, flat, flat2;
...@@ -50,27 +50,27 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, ...@@ -50,27 +50,27 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
const __m128i blimit = const __m128i blimit =
_mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); q4 = _mm_loadl_epi64((__m128i *)(s + 4 * p));
_mm_store_si128((__m128i *)ap[4], p4); _mm_storel_epi64((__m128i *)ap[4], p4);
_mm_store_si128((__m128i *)ap[3], p3); _mm_storel_epi64((__m128i *)ap[3], p3);
_mm_store_si128((__m128i *)ap[2], p2); _mm_storel_epi64((__m128i *)ap[2], p2);
_mm_store_si128((__m128i *)ap[1], p1); _mm_storel_epi64((__m128i *)ap[1], p1);
_mm_store_si128((__m128i *)ap[0], p0); _mm_storel_epi64((__m128i *)ap[0], p0);
_mm_store_si128((__m128i *)aq[4], q4); _mm_storel_epi64((__m128i *)aq[4], q4);
_mm_store_si128((__m128i *)aq[3], q3); _mm_storel_epi64((__m128i *)aq[3], q3);
_mm_store_si128((__m128i *)aq[2], q2); _mm_storel_epi64((__m128i *)aq[2], q2);
_mm_store_si128((__m128i *)aq[1], q1); _mm_storel_epi64((__m128i *)aq[1], q1);
_mm_store_si128((__m128i *)aq[0], q0); _mm_storel_epi64((__m128i *)aq[0], q0);
{ {
...@@ -188,33 +188,33 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, ...@@ -188,33 +188,33 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_cmpeq_epi8(flat, zero);
flat = _mm_and_si128(flat, mask); flat = _mm_and_si128(flat, mask);
p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); q5 = _mm_loadl_epi64((__m128i *)(s + 5 * p));
flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0), flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
_mm_subs_epu8(p0, p5)), _mm_subs_epu8(p0, p5)),
_mm_or_si128(_mm_subs_epu8(q5, q0), _mm_or_si128(_mm_subs_epu8(q5, q0),
_mm_subs_epu8(q0, q5))); _mm_subs_epu8(q0, q5)));
_mm_store_si128((__m128i *)ap[5], p5); _mm_storel_epi64((__m128i *)ap[5], p5);
_mm_store_si128((__m128i *)aq[5], q5); _mm_storel_epi64((__m128i *)aq[5], q5);
flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(work, flat2);
p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); q6 = _mm_loadl_epi64((__m128i *)(s + 6 * p));
work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0), work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
_mm_subs_epu8(p0, p6)), _mm_subs_epu8(p0, p6)),
_mm_or_si128(_mm_subs_epu8(q6, q0), _mm_or_si128(_mm_subs_epu8(q6, q0),
_mm_subs_epu8(q0, q6))); _mm_subs_epu8(q0, q6)));
_mm_store_si128((__m128i *)ap[6], p6); _mm_storel_epi64((__m128i *)ap[6], p6);
_mm_store_si128((__m128i *)aq[6], q6); _mm_storel_epi64((__m128i *)aq[6], q6);
flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(work, flat2);
p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); q7 = _mm_loadl_epi64((__m128i *)(s + 7 * p));
work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0), work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
_mm_subs_epu8(p0, p7)), _mm_subs_epu8(p0, p7)),
_mm_or_si128(_mm_subs_epu8(q7, q0), _mm_or_si128(_mm_subs_epu8(q7, q0),
_mm_subs_epu8(q0, q7))); _mm_subs_epu8(q0, q7)));
_mm_store_si128((__m128i *)ap[7], p7); _mm_storel_epi64((__m128i *)ap[7], p7);
_mm_store_si128((__m128i *)aq[7], q7); _mm_storel_epi64((__m128i *)aq[7], q7);
flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(work, flat2);
flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_subs_epu8(flat2, one);
flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_cmpeq_epi8(flat2, zero);
...@@ -226,30 +226,26 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, ...@@ -226,30 +226,26 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
{ {
const __m128i eight = _mm_set1_epi16(8); const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4); const __m128i four = _mm_set1_epi16(4);
__m128i temp_flat2 = flat2; {
unsigned char *src = s;
int i = 0;
do {
__m128i workp_shft; __m128i workp_shft;
__m128i a, b, c; __m128i a, b, c;
unsigned int off = i * 8; p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7])), zero);
p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero); p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6])), zero);
p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero); p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5])), zero);
p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero); p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4])), zero);
p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero); p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3])), zero);
p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero); p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2])), zero);
p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero); p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1])), zero);
p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero); p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0])), zero);
p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero); q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0])), zero);
q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero); q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1])), zero);
q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero); q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2])), zero);
q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero); q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3])), zero);
q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero); q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4])), zero);
q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero); q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5])), zero);
q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero); q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6])), zero);
q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero); q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7])), zero);
q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);
c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7
c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
...@@ -370,120 +366,117 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, ...@@ -370,120 +366,117 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
_mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
_mm_packus_epi16(workp_shft, workp_shft)); _mm_packus_epi16(workp_shft, workp_shft));
}
temp_flat2 = _mm_srli_si128(temp_flat2, 8);
src += 8;
} while (++i < 2);
} }
// wide flat // wide flat
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
work_a = _mm_load_si128((__m128i *)ap[2]); work_a = _mm_loadl_epi64((__m128i *)ap[2]);
p2 = _mm_load_si128((__m128i *)flat_op[2]); p2 = _mm_loadl_epi64((__m128i *)flat_op[2]);
work_a = _mm_andnot_si128(flat, work_a); work_a = _mm_andnot_si128(flat, work_a);
p2 = _mm_and_si128(flat, p2); p2 = _mm_and_si128(flat, p2);
p2 = _mm_or_si128(work_a, p2); p2 = _mm_or_si128(work_a, p2);
_mm_store_si128((__m128i *)flat_op[2], p2); _mm_storel_epi64((__m128i *)flat_op[2], p2);
p1 = _mm_load_si128((__m128i *)flat_op[1]); p1 = _mm_loadl_epi64((__m128i *)flat_op[1]);
work_a = _mm_andnot_si128(flat, ps1); work_a = _mm_andnot_si128(flat, ps1);
p1 = _mm_and_si128(flat, p1); p1 = _mm_and_si128(flat, p1);
p1 = _mm_or_si128(work_a, p1); p1 = _mm_or_si128(work_a, p1);
_mm_store_si128((__m128i *)flat_op[1], p1); _mm_storel_epi64((__m128i *)flat_op[1], p1);
p0 = _mm_load_si128((__m128i *)flat_op[0]); p0 = _mm_loadl_epi64((__m128i *)flat_op[0]);
work_a = _mm_andnot_si128(flat, ps0); work_a = _mm_andnot_si128(flat, ps0);
p0 = _mm_and_si128(flat, p0); p0 = _mm_and_si128(flat, p0);
p0 = _mm_or_si128(work_a, p0); p0 = _mm_or_si128(work_a, p0);
_mm_store_si128((__m128i *)flat_op[0], p0); _mm_storel_epi64((__m128i *)flat_op[0], p0);
q0 = _mm_load_si128((__m128i *)flat_oq[0]); q0 = _mm_loadl_epi64((__m128i *)flat_oq[0]);
work_a = _mm_andnot_si128(flat, qs0); work_a = _mm_andnot_si128(flat, qs0);
q0 = _mm_and_si128(flat, q0); q0 = _mm_and_si128(flat, q0);
q0 = _mm_or_si128(work_a, q0); q0 = _mm_or_si128(work_a, q0);
_mm_store_si128((__m128i *)flat_oq[0], q0); _mm_storel_epi64((__m128i *)flat_oq[0], q0);
q1 = _mm_load_si128((__m128i *)flat_oq[1]); q1 = _mm_loadl_epi64((__m128i *)flat_oq[1]);
work_a = _mm_andnot_si128(flat, qs1); work_a = _mm_andnot_si128(flat, qs1);
q1 = _mm_and_si128(flat, q1); q1 = _mm_and_si128(flat, q1);
q1 = _mm_or_si128(work_a, q1); q1 = _mm_or_si128(work_a, q1);
_mm_store_si128((__m128i *)flat_oq[1], q1); _mm_storel_epi64((__m128i *)flat_oq[1], q1);
work_a = _mm_load_si128((__m128i *)aq[2]); work_a = _mm_loadl_epi64((__m128i *)aq[2]);
q2 = _mm_load_si128((__m128i *)flat_oq[2]); q2 = _mm_loadl_epi64((__m128i *)flat_oq[2]);
work_a = _mm_andnot_si128(flat, work_a); work_a = _mm_andnot_si128(flat, work_a);
q2 = _mm_and_si128(flat, q2); q2 = _mm_and_si128(flat, q2);
q2 = _mm_or_si128(work_a, q2); q2 = _mm_or_si128(work_a, q2);
_mm_store_si128((__m128i *)flat_oq[2], q2); _mm_storel_epi64((__m128i *)flat_oq[2], q2);
// write out op6 - op3 // write out op6 - op3
{ {
unsigned char *dst = (s - 7 * p); unsigned char *dst = (s - 7 * p);
for (i = 6; i > 2; i--) { for (i = 6; i > 2; i--) {
__m128i flat2_output; __m128i flat2_output;
work_a = _mm_load_si128((__m128i *)ap[i]); work_a = _mm_loadl_epi64((__m128i *)ap[i]);
flat2_output = _mm_load_si128((__m128i *)flat2_op[i]); flat2_output = _mm_loadl_epi64((__m128i *)flat2_op[i]);
work_a = _mm_andnot_si128(flat2, work_a); work_a = _mm_andnot_si128(flat2, work_a);
flat2_output = _mm_and_si128(flat2, flat2_output); flat2_output = _mm_and_si128(flat2, flat2_output);
work_a = _mm_or_si128(work_a, flat2_output); work_a = _mm_or_si128(work_a, flat2_output);
_mm_storeu_si128((__m128i *)dst, work_a); _mm_storel_epi64((__m128i *)dst, work_a);
dst += p; dst += p;
} }
} }
work_a = _mm_load_si128((__m128i *)flat_op[2]); work_a = _mm_loadl_epi64((__m128i *)flat_op[2]);
p2 = _mm_load_si128((__m128i *)flat2_op[2]); p2 = _mm_loadl_epi64((__m128i *)flat2_op[2]);
work_a = _mm_andnot_si128(flat2, work_a); work_a = _mm_andnot_si128(flat2, work_a);
p2 = _mm_and_si128(flat2, p2); p2 = _mm_and_si128(flat2, p2);
p2 = _mm_or_si128(work_a, p2); p2 = _mm_or_si128(work_a, p2);
_mm_storeu_si128((__m128i *)(s - 3 * p), p2); _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
work_a = _mm_load_si128((__m128i *)flat_op[1]); work_a = _mm_loadl_epi64((__m128i *)flat_op[1]);
p1 = _mm_load_si128((__m128i *)flat2_op[1]); p1 = _mm_loadl_epi64((__m128i *)flat2_op[1]);
work_a = _mm_andnot_si128(flat2, work_a); work_a = _mm_andnot_si128(flat2, work_a);
p1 = _mm_and_si128(flat2, p1); p1 = _mm_and_si128(flat2, p1);
p1 = _mm_or_si128(work_a, p1); p1 = _mm_or_si128(work_a, p1);
_mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
work_a = _mm_load_si128((__m128i *)flat_op[0]); work_a = _mm_loadl_epi64((__m128i *)flat_op[0]);
p0 = _mm_load_si128((__m128i *)flat2_op[0]); p0 = _mm_loadl_epi64((__m128i *)flat2_op[0]);
work_a = _mm_andnot_si128(flat2, work_a); work_a = _mm_andnot_si128(flat2, work_a);
p0 = _mm_and_si128(flat2, p0); p0 = _mm_and_si128(flat2, p0);
p0 = _mm_or_si128(work_a, p0); p0 = _mm_or_si128(work_a, p0);
_mm_storeu_si128((__m128i *)(s - 1 * p), p0); _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
work_a = _mm_load_si128((__m128i *)flat_oq[0]); work_a = _mm_loadl_epi64((__m128i *)flat_oq[0]);
q0 = _mm_load_si128((__m128i *)flat2_oq[0]); q0 = _mm_loadl_epi64((__m128i *)flat2_oq[0]);
work_a = _mm_andnot_si128(flat2, work_a); work_a = _mm_andnot_si128(flat2, work_a);
q0 = _mm_and_si128(flat2, q0); q0 = _mm_and_si128(flat2, q0);
q0 = _mm_or_si128(work_a, q0); q0 = _mm_or_si128(work_a, q0);
_mm_storeu_si128((__m128i *)(s - 0 * p), q0); _mm_storel_epi64((__m128i *)(s - 0 * p), q0);
work_a = _mm_load_si128((__m128i *)flat_oq[1]); work_a = _mm_loadl_epi64((__m128i *)flat_oq[1]);
q1 = _mm_load_si128((__m128i *)flat2_oq[1]); q1 = _mm_loadl_epi64((__m128i *)flat2_oq[1]);
work_a = _mm_andnot_si128(flat2, work_a); work_a = _mm_andnot_si128(flat2, work_a);
q1 = _mm_and_si128(flat2, q1); q1 = _mm_and_si128(flat2, q1);
q1 = _mm_or_si128(work_a, q1); q1 = _mm_or_si128(work_a, q1);
_mm_storeu_si128((__m128i *)(s + 1 * p), q1); _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
work_a = _mm_load_si128((__m128i *)flat_oq[2]); work_a = _mm_loadl_epi64((__m128i *)flat_oq[2]);
q2 = _mm_load_si128((__m128i *)flat2_oq[2]); q2 = _mm_loadl_epi64((__m128i *)flat2_oq[2]);
work_a = _mm_andnot_si128(flat2, work_a); work_a = _mm_andnot_si128(flat2, work_a);
q2 = _mm_and_si128(flat2, q2); q2 = _mm_and_si128(flat2, q2);
q2 = _mm_or_si128(work_a, q2); q2 = _mm_or_si128(work_a, q2);
_mm_storeu_si128((__m128i *)(s + 2 * p), q2); _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
// write out oq3 - oq7 // write out oq3 - oq7
{ {
unsigned char *dst = (s + 3 * p); unsigned char *dst = (s + 3 * p);
for (i = 3; i < 7; i++) { for (i = 3; i < 7; i++) {
__m128i flat2_output; __m128i flat2_output;
work_a = _mm_load_si128((__m128i *)aq[i]); work_a = _mm_loadl_epi64((__m128i *)aq[i]);
flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]); flat2_output = _mm_loadl_epi64((__m128i *)flat2_oq[i]);
work_a = _mm_andnot_si128(flat2, work_a); work_a = _mm_andnot_si128(flat2, work_a);
flat2_output = _mm_and_si128(flat2, flat2_output); flat2_output = _mm_and_si128(flat2, flat2_output);
work_a = _mm_or_si128(work_a, flat2_output); work_a = _mm_or_si128(work_a, flat2_output);
_mm_storeu_si128((__m128i *)dst, work_a); _mm_storel_epi64((__m128i *)dst, work_a);
dst += p; dst += p;
} }
} }
...@@ -967,9 +960,14 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, ...@@ -967,9 +960,14 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
unsigned char *src[4]; unsigned char *src[4];
unsigned char *dst[4]; unsigned char *dst[4];
dst[0] = t_dst;
dst[1] = t_dst + 8 * 16;
src[0] = s - 8;
src[1] = s - 8 + 8;
/* Transpose 16x16 */ /* Transpose 16x16 */
transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16); transpose(src, p, dst, 16, 2);
transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
/* Loop filtering */ /* Loop filtering */
vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit, vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
...@@ -977,16 +975,11 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, ...@@ -977,16 +975,11 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
src[0] = t_dst;