Commit 8b698e8c authored by Yunqing Wang's avatar Yunqing Wang Committed by Gerrit Code Review
Browse files

Merge "post-proc: deblock filter optimization"

parents 107f14bc 4c53bacc
......@@ -19,9 +19,9 @@ typedef void (*post_proc_func_t)(unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit);
unsigned char *flimit,
int size);
namespace {
......@@ -29,7 +29,7 @@ class Vp8PostProcessingFilterTest
: public ::testing::TestWithParam<post_proc_func_t> {};
// Test routine for the VP8 post-processing function
// vp8_post_proc_down_and_across_c.
// vp8_post_proc_down_and_across_mb_row_c.
TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
// Size of the underlying data block that will be filtered.
......@@ -56,6 +56,8 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
// Pointers to top-left pixel of block in the input and output images.
uint8_t *const src_image_ptr = src_image + (input_stride << 1);
uint8_t *const dst_image_ptr = dst_image + 8;
uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
(void)vpx_memset(flimits, 255, block_width);
// Initialize pixels in the input:
// block pixels to value 1,
......@@ -73,14 +75,13 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
(void)vpx_memset(dst_image, 99, output_size);
GetParam()(src_image_ptr, dst_image_ptr, input_stride,
output_stride, block_height, block_width,
255);
output_stride, block_width, flimits, 16);
static const uint8_t expected_data[block_height] = {
3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3
4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
};
pixel_ptr = dst_image;
pixel_ptr = dst_image_ptr;
for (int i = 0; i < block_height; ++i) {
for (int j = 0; j < block_width; ++j) {
EXPECT_EQ(expected_data[i], pixel_ptr[j])
......@@ -91,19 +92,15 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
vpx_free(src_image);
vpx_free(dst_image);
vpx_free(flimits);
};
INSTANTIATE_TEST_CASE_P(C, Vp8PostProcessingFilterTest,
::testing::Values(vp8_post_proc_down_and_across_c));
#if HAVE_MMX
INSTANTIATE_TEST_CASE_P(MMX, Vp8PostProcessingFilterTest,
::testing::Values(vp8_post_proc_down_and_across_mmx));
#endif
::testing::Values(vp8_post_proc_down_and_across_mb_row_c));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, Vp8PostProcessingFilterTest,
::testing::Values(vp8_post_proc_down_and_across_xmm));
::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
#endif
} // namespace
......@@ -127,25 +127,24 @@ extern void vp8_blit_text(const char *msg, unsigned char *address, const int pit
extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
/***********************************************************************************************************
*/
void vp8_post_proc_down_and_across_c
void vp8_post_proc_down_and_across_mb_row_c
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit
unsigned char *f,
int size
)
{
unsigned char *p_src, *p_dst;
int row;
int col;
int i;
int v;
unsigned char d[8];
unsigned char v;
unsigned char d[4];
for (row = 0; row < rows; row++)
for (row = 0; row < size; row++)
{
/* post_proc_down for one row */
p_src = src_ptr;
......@@ -153,20 +152,23 @@ void vp8_post_proc_down_and_across_c
for (col = 0; col < cols; col++)
{
unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
unsigned char p_above1 = p_src[col - src_pixels_per_line];
unsigned char p_below1 = p_src[col + src_pixels_per_line];
unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
int kernel = 4;
int v = p_src[col];
v = p_src[col];
for (i = -2; i <= 2; i++)
if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
&& (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col]))
{
if (abs(v - p_src[col+i*src_pixels_per_line]) > flimit)
goto down_skip_convolve;
kernel += kernel5[2+i] * p_src[col+i*src_pixels_per_line];
unsigned char k1, k2, k3;
k1 = (p_above2 + p_above1 + 1) >> 1;
k2 = (p_below2 + p_below1 + 1) >> 1;
k3 = (k1 + k2 + 1) >> 1;
v = (k3 + v + 1) >> 1;
}
v = (kernel >> 3);
down_skip_convolve:
p_dst[col] = v;
}
......@@ -174,40 +176,34 @@ void vp8_post_proc_down_and_across_c
p_src = dst_ptr;
p_dst = dst_ptr;
for (i = -8; i<0; i++)
p_src[i]=p_src[0];
for (i = cols; i<cols+8; i++)
p_src[i]=p_src[cols-1];
for (i = 0; i < 8; i++)
d[i] = p_src[i];
p_src[-2] = p_src[-1] = p_src[0];
p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
for (col = 0; col < cols; col++)
{
int kernel = 4;
v = p_src[col];
d[col&7] = v;
for (i = -2; i <= 2; i++)
if ((abs(v - p_src[col - 2]) < f[col])
&& (abs(v - p_src[col - 1]) < f[col])
&& (abs(v - p_src[col + 1]) < f[col])
&& (abs(v - p_src[col + 2]) < f[col]))
{
if (abs(v - p_src[col+i]) > flimit)
goto across_skip_convolve;
kernel += kernel5[2+i] * p_src[col+i];
unsigned char k1, k2, k3;
k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
k3 = (k1 + k2 + 1) >> 1;
v = (k3 + v + 1) >> 1;
}
d[col&7] = (kernel >> 3);
across_skip_convolve:
d[col & 3] = v;
if (col >= 2)
p_dst[col-2] = d[(col-2)&7];
p_dst[col - 2] = d[(col - 2) & 3];
}
/* handle the last two pixels */
p_dst[col-2] = d[(col-2)&7];
p_dst[col-1] = d[(col-1)&7];
p_dst[col - 2] = d[(col - 2) & 3];
p_dst[col - 1] = d[(col - 1) & 3];
/* next row */
src_ptr += src_pixels_per_line;
......@@ -318,28 +314,17 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
}
}
static void vp8_deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
int flag)
static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
int q)
{
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
(void) low_var_thresh;
(void) flag;
vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
post->y_width, q2mbl(q));
}
void vp8_deblock(YV12_BUFFER_CONFIG *source,
void vp8_deblock(VP8_COMMON *cm,
YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
......@@ -347,12 +332,58 @@ void vp8_deblock(YV12_BUFFER_CONFIG *source,
{
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
const MODE_INFO *mode_info_context = cm->mi;
int mbr, mbc;
/* The pixel thresholds are adjusted according to if or not the macroblock
* is a skipped block. */
unsigned char *ylimits = (unsigned char *)vpx_memalign(16, 16 * cm->mb_cols);
unsigned char *uvlimits = (unsigned char *)vpx_memalign(16, 8 * cm->mb_cols);
(void) low_var_thresh;
(void) flag;
vp8_post_proc_down_and_across(source->y_buffer, post->y_buffer, source->y_stride, post->y_stride, source->y_height, source->y_width, ppl);
vp8_post_proc_down_and_across(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
vp8_post_proc_down_and_across(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
if (ppl > 0)
{
for (mbr = 0; mbr < cm->mb_rows; mbr++)
{
unsigned char *ylptr = ylimits;
unsigned char *uvlptr = uvlimits;
for (mbc = 0; mbc < cm->mb_cols; mbc++)
{
unsigned char mb_ppl;
if (mode_info_context->mbmi.mb_skip_coeff)
mb_ppl = (unsigned char)ppl >> 1;
else
mb_ppl = (unsigned char)ppl;
vpx_memset(ylptr, mb_ppl, 16);
vpx_memset(uvlptr, mb_ppl, 8);
ylptr += 16;
uvlptr += 8;
mode_info_context++;
}
mode_info_context++;
vp8_post_proc_down_and_across_mb_row(
source->y_buffer + 16 * mbr * source->y_stride,
post->y_buffer + 16 * mbr * post->y_stride, source->y_stride,
post->y_stride, source->y_width, ylimits, 16);
vp8_post_proc_down_and_across_mb_row(
source->u_buffer + 8 * mbr * source->uv_stride,
post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
post->uv_stride, source->uv_width, uvlimits, 8);
vp8_post_proc_down_and_across_mb_row(
source->v_buffer + 8 * mbr * source->uv_stride,
post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
post->uv_stride, source->uv_width, uvlimits, 8);
}
}
vpx_free(ylimits);
vpx_free(uvlimits);
}
#if !(CONFIG_TEMPORAL_DENOISING)
......@@ -364,33 +395,35 @@ void vp8_de_noise(YV12_BUFFER_CONFIG *source,
{
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
int mb_rows = source->y_width >> 4;
int mb_cols = source->y_height >> 4;
unsigned char *limits = (unsigned char *)vpx_memalign(16, 16 * mb_cols);
int mbr, mbc;
(void) post;
(void) low_var_thresh;
(void) flag;
vp8_post_proc_down_and_across(
source->y_buffer + 2 * source->y_stride + 2,
source->y_buffer + 2 * source->y_stride + 2,
source->y_stride,
source->y_stride,
source->y_height - 4,
source->y_width - 4,
ppl);
vp8_post_proc_down_and_across(
source->u_buffer + 2 * source->uv_stride + 2,
source->u_buffer + 2 * source->uv_stride + 2,
source->uv_stride,
source->uv_stride,
source->uv_height - 4,
source->uv_width - 4, ppl);
vp8_post_proc_down_and_across(
source->v_buffer + 2 * source->uv_stride + 2,
source->v_buffer + 2 * source->uv_stride + 2,
source->uv_stride,
source->uv_stride,
source->uv_height - 4,
source->uv_width - 4, ppl);
/* TODO: The original code don't filter the 2 outer rows and columns. */
vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols);
for (mbr = 0; mbr < mb_rows; mbr++)
{
vp8_post_proc_down_and_across_mb_row(
source->y_buffer + 16 * mbr * source->y_stride,
source->y_buffer + 16 * mbr * source->y_stride,
source->y_stride, source->y_stride, source->y_width, limits, 16);
vp8_post_proc_down_and_across_mb_row(
source->u_buffer + 8 * mbr * source->uv_stride,
source->u_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
vp8_post_proc_down_and_across_mb_row(
source->v_buffer + 8 * mbr * source->uv_stride,
source->v_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
}
vpx_free(limits);
}
#endif
......@@ -752,12 +785,14 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
if (flags & VP8D_DEMACROBLOCK)
{
vp8_deblock_and_de_macro_block(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
q + (deblock_level - 5) * 10, 1, 0);
vp8_de_mblock(&oci->post_proc_buffer,
q + (deblock_level - 5) * 10);
}
else if (flags & VP8D_DEBLOCK)
{
vp8_deblock(&oci->post_proc_buffer_int, &oci->post_proc_buffer,
vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
q, 1, 0);
}
}
......@@ -766,13 +801,15 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
}
else if (flags & VP8D_DEMACROBLOCK)
{
vp8_deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
q + (deblock_level - 5) * 10, 1, 0);
vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
q + (deblock_level - 5) * 10, 1, 0);
vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
oci->postproc_state.last_base_qindex = oci->base_qindex;
}
else if (flags & VP8D_DEBLOCK)
{
vp8_deblock(oci->frame_to_show, &oci->post_proc_buffer,
vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
q, 1, 0);
oci->postproc_state.last_base_qindex = oci->base_qindex;
}
......
......@@ -36,7 +36,8 @@ void vp8_de_noise(YV12_BUFFER_CONFIG *source,
int low_var_thresh,
int flag);
void vp8_deblock(YV12_BUFFER_CONFIG *source,
void vp8_deblock(struct VP8Common *oci,
YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
......
......@@ -19,14 +19,14 @@ void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
extern void (*vp8_post_proc_down_and_across)(
extern void (*vp8_post_proc_down_and_across_mb_row)(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit
unsigned char *f,
int size
);
extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
......@@ -34,15 +34,15 @@ extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int
extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
extern void vp8_post_proc_down_and_across_c
extern void vp8_post_proc_down_and_across_mb_row_c
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows,
int cols,
int flimit
unsigned char *f,
int size
);
void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
......@@ -158,7 +158,7 @@ void vp8_machine_specific_config(void)
vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
vp8_lf_bhsimple = loop_filter_bhs_ppc;
vp8_post_proc_down_and_across = vp8_post_proc_down_and_across_c;
vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
vp8_plane_add_noise = vp8_plane_add_noise_c;
......
......@@ -162,9 +162,8 @@ if [ "$CONFIG_POSTPROC" = "yes" ]; then
specialize vp8_mbpost_proc_across_ip sse2
vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm
prototype void vp8_post_proc_down_and_across "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int rows, int cols, int flimit"
specialize vp8_post_proc_down_and_across mmx sse2
vp8_post_proc_down_and_across_sse2=vp8_post_proc_down_and_across_xmm
prototype void vp8_post_proc_down_and_across_mb_row "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"
specialize vp8_post_proc_down_and_across_mb_row sse2
prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"
specialize vp8_plane_add_noise mmx sse2
......
......@@ -14,271 +14,6 @@
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT 7
;void vp8_post_proc_down_and_across_mmx
;(
; unsigned char *src_ptr,
; unsigned char *dst_ptr,
; int src_pixels_per_line,
; int dst_pixels_per_line,
; int rows,
; int cols,
; int flimit
;)
global sym(vp8_post_proc_down_and_across_mmx) PRIVATE
sym(vp8_post_proc_down_and_across_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
; end prolog
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
; move the global rd onto the stack, since we don't have enough registers
; to do PIC addressing
movq mm0, [GLOBAL(rd)]
sub rsp, 8
movq [rsp], mm0
%define RD [rsp]
%else
%define RD [GLOBAL(rd)]
%endif
push rbx
lea rbx, [GLOBAL(Blur)]
movd mm2, dword ptr arg(6) ;flimit
punpcklwd mm2, mm2
punpckldq mm2, mm2
mov rsi, arg(0) ;src_ptr
mov rdi, arg(1) ;dst_ptr
movsxd rcx, DWORD PTR arg(4) ;rows
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
.nextrow:
xor rdx, rdx ; clear out rdx for use as loop counter
.nextcol:
pxor mm7, mm7 ; mm7 = 00000000
movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
movq mm3, [rsi] ; mm4 = r0 p0..p7
punpcklbw mm3, mm0 ; mm3 = p0..p3
movq mm1, mm3 ; mm1 = p0..p3
pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
movq mm6, [rbx + 48] ; mm6 = kernel 3 taps
movq mm5, [rsi + rax] ; mm4 = r1 p0..p7
punpcklbw mm5, mm0 ; mm5 = r1 p0..p3
pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers
paddusw mm3, mm6 ; mm3 += mm6
; thresholding
movq mm7, mm1 ; mm7 = r0 p0..p3
psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3
psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3
paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
pcmpgtw mm7, mm2
movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers
movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7
punpcklbw mm5, mm0 ; mm5 = r2 p0..p3
pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
paddusw mm3, mm6 ; mm3 += mm5
; thresholding
movq mm6, mm1 ; mm6 = r0 p0..p3
psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3
psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3
paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
pcmpgtw mm6, mm2
por mm7, mm6 ; accumulate thresholds
neg rax
movq mm6, [rbx ] ; kernel 0 taps
movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7
punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3
pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
paddusw mm3, mm6 ; mm3 += mm5
; thresholding
movq mm6, mm1 ; mm6 = r0 p0..p3
psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3
psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3
paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
pcmpgtw mm6, mm2
por mm7, mm6 ; accumulate thresholds
movq mm6, [rbx + 16] ; kernel 1 taps
movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7
punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3
pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
paddusw mm3, mm6 ; mm3 += mm5
; thresholding
movq mm6, mm1 ; mm6 = r0 p0..p3
psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3
psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3
paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
pcmpgtw mm6, mm2
por mm7, mm6 ; accumulate thresholds
paddusw mm3, RD ; mm3 += round value
psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
pand mm1, mm7 ; mm1 select vals > thresh from source
pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
paddusw mm1, mm7 ; combination
packuswb mm1, mm0 ; pack to bytes
movd [rdi], mm1 ;
neg rax ; pitch is positive
add rsi, 4
add rdi, 4
add rdx, 4
cmp edx, dword ptr arg(5) ;cols
jl .nextcol
; done with the all cols, start the across filtering in place
sub rsi, rdx
sub rdi, rdx
; dup the first byte into the left border 8 times
movq mm1, [rdi]
punpcklbw mm1, mm1
punpcklwd mm1, mm1
punpckldq mm1, mm1
mov rdx, -8
movq [rdi+rdx], mm1
; dup the last byte into the right border
movsxd rdx, dword arg(5)