Commit 72af533f authored by Yunqing Wang, committed by Gerrit Code Review

Merge "Align image buffer in multiple-resolution encoder"

parents 6b2792b0 153eec46
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 90
Version: 102
License: BSD
License File: LICENSE
@@ -13,21 +13,12 @@
#include <stddef.h> // for NULL, size_t
#ifndef WIN32
#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
#include <stdint.h> // for uintptr_t
#endif
#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED
#ifdef COMPILER_MSVC
typedef __int64 int64;
#else
typedef long long int64;
#endif /* COMPILER_MSVC */
typedef int int32;
typedef short int16;
typedef char int8;
#ifdef COMPILER_MSVC
typedef unsigned __int64 uint64;
typedef __int64 int64;
@@ -38,9 +29,20 @@ typedef __int64 int64;
#define UINT64_C(x) x ## UI64
#endif
#define INT64_F "I64"
#else
#else // COMPILER_MSVC
#ifdef __LP64__
typedef unsigned long uint64;
typedef long int64;
#ifndef INT64_C
#define INT64_C(x) x ## L
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## UL
#endif
#define INT64_F "l"
#else // __LP64__
typedef unsigned long long uint64;
//typedef long long int64;
typedef long long int64;
#ifndef INT64_C
#define INT64_C(x) x ## LL
#endif
@@ -48,10 +50,14 @@ typedef unsigned long long uint64;
#define UINT64_C(x) x ## ULL
#endif
#define INT64_F "ll"
#endif /* COMPILER_MSVC */
#endif // __LP64__
#endif // COMPILER_MSVC
typedef unsigned int uint32;
typedef int int32;
typedef unsigned short uint16;
typedef short int16;
typedef unsigned char uint8;
typedef char int8;
#endif // INT_TYPES_DEFINED
// Detect compiler is for x86 or x64.
@@ -60,7 +66,6 @@ typedef unsigned char uint8;
#define CPU_X86 1
#endif
#define IS_ALIGNED(p, a) (0==((uintptr_t)(p) & ((a)-1)))
#define ALIGNP(p, t) \
((uint8*)((((uintptr_t)(p) + \
((t)-1)) & ~((t)-1))))
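
ALIGNP(p, t) rounds the pointer p up to the next multiple of t (t must be a power of two), and IS_ALIGNED tests the result. This is the pattern used to align an over-allocated image buffer, as in the merged change. A minimal sketch of that usage; the 32-byte alignment and the helper name are illustrative, not taken from this commit:

#include <stdlib.h>
#include "third_party/libyuv/include/libyuv/basic_types.h"  // ALIGNP, IS_ALIGNED, uint8

// Over-allocate by align - 1 bytes so the rounded-up pointer stays in bounds.
// The caller keeps *raw_out and passes it to free(); the returned pointer is
// aligned and must not be freed directly.
static uint8* AllocAlignedBuffer(size_t size, size_t align, uint8** raw_out) {
  uint8* raw = (uint8*)malloc(size + align - 1);
  if (!raw) return NULL;
  *raw_out = raw;
  uint8* aligned = ALIGNP(raw, align);   // e.g. align == 32
  // IS_ALIGNED(aligned, align) is now non-zero.
  return aligned;
}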
@@ -11,21 +11,39 @@
#ifndef INCLUDE_LIBYUV_CPU_ID_H_
#define INCLUDE_LIBYUV_CPU_ID_H_
//namespace libyuv {
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// These flags are only valid on x86 processors
static const int kCpuHasSSE2 = 1;
static const int kCpuHasSSSE3 = 2;
// SIMD support on ARM processors
// These flags are only valid on ARM processors
static const int kCpuHasNEON = 4;
// Internal flag to indicate cpuid is initialized.
static const int kCpuInitialized = 8;
// Detect CPU has SSE2 etc.
int TestCpuFlag(int flag);
// test_flag parameter should be one of kCpuHas constants above
// returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
extern int cpu_info_;
extern int InitCpuFlags();
return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag;
}
// For testing, allow CPU flags to be disabled.
void MaskCpuFlagsForTest(int enable_flags);
//} // namespace libyuv
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// -1 to enable all cpu specific optimizations.
// 0 to disable all cpu specific optimizations.
void MaskCpuFlags(int enable_flags);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_CPU_ID_H_
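
With the rewritten header, TestCpuFlag() is an inline that lazily calls InitCpuFlags(), and MaskCpuFlags() replaces the old MaskCpuFlagsForTest(). A minimal sketch of how a caller might use them; the row-function names in the comments are placeholders, not symbols from this commit:

#include "third_party/libyuv/include/libyuv/basic_types.h"  // uint8
#include "third_party/libyuv/include/libyuv/cpu_id.h"

// Pick an implementation at run time based on the detected instruction set.
void ScaleRowDispatch(const uint8* src, uint8* dst, int width) {
  if (TestCpuFlag(kCpuHasSSSE3)) {
    // ScaleRow_SSSE3(src, dst, width);   // hypothetical SSSE3 path
  } else {
    // ScaleRow_C(src, dst, width);       // hypothetical C fallback
  }
  (void)src; (void)dst; (void)width;
}

// In tests, optimizations can be masked off, mirroring the header comment:
//   MaskCpuFlags(~kCpuHasSSSE3);  // disable SSSE3 only
//   MaskCpuFlags(0);              // disable all cpu specific optimizations
//   MaskCpuFlags(-1);             // enable all cpu specific optimizations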
@@ -13,7 +13,10 @@
#include "third_party/libyuv/include/libyuv/basic_types.h"
//namespace libyuv {
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Supported filtering
typedef enum {
@@ -42,16 +45,8 @@ int I420Scale(const uint8* src_y, int src_stride_y,
int dst_width, int dst_height,
FilterMode filtering);
// Legacy API
// If dst_height_offset is non-zero, the image is offset by that many pixels
// and stretched to (dst_height - dst_height_offset * 2) pixels high,
// instead of dst_height.
int Scale_1(const uint8* src, int src_width, int src_height,
uint8* dst, int dst_width, int dst_height, int dst_height_offset,
int interpolate);
// Same, but specified src terms of each plane location and stride.
int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v,
// Legacy API. Deprecated
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height,
uint8* dst_y, uint8* dst_u, uint8* dst_v,
@@ -59,9 +54,17 @@ int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int dst_width, int dst_height,
int interpolate);
// Legacy API. Deprecated
int ScaleOffset(const uint8* src, int src_width, int src_height,
uint8* dst, int dst_width, int dst_height, int dst_yoffset,
int interpolate);
// For testing, allow disabling of optimizations.
void SetUseReferenceImpl(int use);
//} // namespace libyuv
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_SCALE_H_
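
I420Scale() is the current entry point; Scale() and ScaleOffset() are kept only as deprecated legacy APIs. A hedged sketch of a half-size downscale: the middle I420Scale parameters (the remaining plane pointers and strides) are not visible in the hunk above, so the ordering below assumes the usual libyuv layout, and kFilterBox is assumed to be one of the FilterMode values from the full header.

#include "third_party/libyuv/include/libyuv/scale.h"

// Downscale an I420 frame to half size; strides assume unpadded planes.
int HalveI420(const uint8* src_y, const uint8* src_u, const uint8* src_v,
              int src_width, int src_height,
              uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  int dst_width = src_width / 2;
  int dst_height = src_height / 2;
  return I420Scale(src_y, src_width,       // Y plane and stride
                   src_u, src_width / 2,   // U plane and stride
                   src_v, src_width / 2,   // V plane and stride
                   src_width, src_height,
                   dst_y, dst_width,
                   dst_u, dst_width / 2,
                   dst_v, dst_width / 2,
                   dst_width, dst_height,
                   kFilterBox);            // box filter suits downscaling
}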
@@ -9,66 +9,73 @@
*/
#include "third_party/libyuv/include/libyuv/cpu_id.h"
#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifdef __ANDROID__
#include <cpu-features.h>
#endif
#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static inline void __cpuid(int cpu_info[4], int info_type) {
__asm__ volatile (
"mov %%ebx, %%edi\n"
"cpuid\n"
"xchg %%edi, %%ebx\n"
asm volatile (
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type)
);
}
#elif defined(__i386__) || defined(__x86_64__)
static inline void __cpuid(int cpu_info[4], int info_type) {
__asm__ volatile (
"cpuid\n"
asm volatile (
"cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type)
);
}
#endif
//namespace libyuv {
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// CPU detect function for SIMD instruction sets.
static int cpu_info_initialized_ = 0;
static int cpu_info_ = 0;
int cpu_info_ = 0;
// Global lock for cpu initialization.
static void InitCpuFlags() {
int InitCpuFlags() {
#ifdef CPU_X86
int cpu_info[4];
__cpuid(cpu_info, 1);
cpu_info_ = (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
(cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0);
cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
(cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
kCpuInitialized;
#elif defined(__ANDROID__) && defined(__ARM_NEON__)
uint64_t features = android_getCpuFeatures();
cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
kCpuInitialized;
#elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__
// if code is specifically built for Neon-only, enable the flag.
cpu_info_ |= kCpuHasNEON;
// Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
// to disable Neon on devices that do not have it.
cpu_info_ = kCpuHasNEON | kCpuInitialized;
#else
cpu_info_ = 0;
cpu_info_ = kCpuInitialized;
#endif
cpu_info_initialized_ = 1;
return cpu_info_;
}
void MaskCpuFlagsForTest(int enable_flags) {
void MaskCpuFlags(int enable_flags) {
InitCpuFlags();
cpu_info_ &= enable_flags;
}
int TestCpuFlag(int flag) {
if (!cpu_info_initialized_) {
InitCpuFlags();
}
return cpu_info_ & flag ? 1 : 0;
cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
}
//} // namespace libyuv
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
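
The masks in InitCpuFlags() are CPUID leaf-1 feature bits: after __cpuid(cpu_info, 1), cpu_info[2] holds ECX and cpu_info[3] holds EDX, with ECX bit 9 (0x00000200) signalling SSSE3 and EDX bit 26 (0x04000000) signalling SSE2. The same test written with explicit shifts, as an illustrative sketch rather than part of the change:

#include "third_party/libyuv/include/libyuv/cpu_id.h"

// Decode the CPUID leaf-1 feature words into libyuv's flag bits.
static int DecodeX86Features(const int cpu_info[4]) {
  int flags = kCpuInitialized;
  if (cpu_info[3] & (1 << 26)) flags |= kCpuHasSSE2;   // EDX bit 26
  if (cpu_info[2] & (1 << 9))  flags |= kCpuHasSSSE3;  // ECX bit 9
  return flags;
}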
@@ -14,7 +14,7 @@
#include "third_party/libyuv/include/libyuv/basic_types.h"
#define kMaxStride (2048 * 4)
//#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
#define YUV_DISABLE_ASM
@@ -72,7 +72,10 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
#define HAS_REVERSE_ROW_NEON
#endif
//extern "C" {
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@@ -253,6 +256,9 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
#endif
//} // extern "C"
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // LIBYUV_SOURCE_ROW_H_
@@ -15,6 +15,17 @@
#include "third_party/libyuv/include/libyuv/cpu_id.h"
#include "third_party/libyuv/source/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
/*
* Note: Defining YUV_DISABLE_ASM allows to use c version.
*/
//#define YUV_DISABLE_ASM
#if defined(_MSC_VER)
#define ALIGN16(var) __declspec(align(16)) var
#else
@@ -26,8 +37,6 @@
// Note: Some SSE2 reference manuals
// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
//namespace libyuv {
// Set the following flag to true to revert to only
// using the reference implementation ScalePlaneBox(), and
// NOT the optimized versions. Useful for debugging and
@@ -40,9 +49,7 @@ void SetUseReferenceImpl(int use) {
use_reference_impl_ = use;
}
// TODO: The preprocessor definitions for Win64 are not right in build system.
// Disable optimized code for now.
#define YUV_DISABLE_ASM
// ScaleRowDown2Int also used by planar functions
/**
* NEON downscalers with interpolation.
@@ -511,83 +518,116 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
!defined(YUV_DISABLE_ASM)
#if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
#elif defined(OSX) && defined(__i386__)
#elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
#endif
#if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \
defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".globl _" #name " \n" \
"_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
".text \n" \
".global " #name " \n" \
#name ": \n"
#endif
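
Both macros above exist to make the assembler-level symbol names line up across toolchains: Mach-O and MinGW/Cygwin targets prefix C symbols with an underscore, ELF targets do not. The effect, derived from the macro definitions and shown here only for clarity:

// TALIGN16(const uint8, shuf0[16]) on the GCC paths:
//   Apple/MinGW/Cygwin (i386): C identifier  shuf0  -> assembler symbol _shuf0
//   other GCC targets (ELF):   C identifier _shuf0  -> assembler symbol _shuf0
// so the asm() blocks below can always reference the tables as _shuf0 etc.
//
// DECLARE_FUNCTION(ScaleRowDown8Int_SSE2):
//   Apple/MinGW/Cygwin: ".globl _ScaleRowDown8Int_SSE2" with label "_ScaleRowDown8Int_SSE2:"
//   ELF:                ".global ScaleRowDown8Int_SSE2" with label "ScaleRowDown8Int_SSE2:"
// which matches how each platform mangles the C prototype the asm implements.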
// Offsets for source bytes 0 to 9
//extern "C"
TALIGN16(const uint8, shuf0[16]) =
{ 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
//extern "C"
TALIGN16(const uint8, shuf1[16]) =
{ 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
//extern "C"
TALIGN16(const uint8, shuf2[16]) =
{ 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 0 to 10
//extern "C"
TALIGN16(const uint8, shuf01[16]) =
{ 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
//extern "C"
TALIGN16(const uint8, shuf11[16]) =
{ 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
//extern "C"
TALIGN16(const uint8, shuf21[16]) =
{ 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
// Coefficients for source bytes 0 to 10
//extern "C"
TALIGN16(const uint8, madd01[16]) =
{ 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
// Coefficients for source bytes 10 to 21
//extern "C"
TALIGN16(const uint8, madd11[16]) =
{ 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
// Coefficients for source bytes 21 to 31
//extern "C"
TALIGN16(const uint8, madd21[16]) =
{ 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
// Coefficients for source bytes 21 to 31
//extern "C"
TALIGN16(const int16, round34[8]) =
{ 2, 2, 2, 2, 2, 2, 2, 2 };
//extern "C"
TALIGN16(const uint8, shuf38a[16]) =
{ 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
//extern "C"
TALIGN16(const uint8, shuf38b[16]) =
{ 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
// Arrange words 0,3,6 into 0,1,2
//extern "C"
TALIGN16(const uint8, shufac0[16]) =
{ 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
// Arrange words 0,3,6 into 3,4,5
//extern "C"
TALIGN16(const uint8, shufac3[16]) =
{ 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
// Scaling values for boxes of 3x3 and 2x3
//extern "C"
TALIGN16(const uint16, scaleac3[8]) =
{ 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
// Arrange first value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab0[16]) =
{ 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
// Arrange second value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab1[16]) =
{ 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
// Arrange third value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab2[16]) =
{ 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
// Scaling values for boxes of 3x2 and 2x2
//extern "C"
TALIGN16(const uint16, scaleab2[8]) =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif
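
The tables above are pshufb control masks: output byte i receives src[mask[i] & 15], except that a mask byte with its high bit set (the 128 entries) zeroes that output byte. For example, shuf0 gathers source bytes 0, 1, 3, 4, 5, 7, 8, 9 into the low eight output bytes and zeroes the high eight. A small sketch of that semantics with the SSSE3 intrinsic, illustrative only and not part of the change:

#include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8

// Apply a 16-byte shuffle mask such as shuf0 to 16 source bytes.
static void Pshufb16(const unsigned char src[16],
                     const unsigned char mask[16],
                     unsigned char out[16]) {
  __m128i s = _mm_loadu_si128((const __m128i*)src);
  __m128i m = _mm_loadu_si128((const __m128i*)mask);
  _mm_storeu_si128((__m128i*)out, _mm_shuffle_epi8(s, m));
}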
@@ -1620,14 +1660,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
asm(
".text \n"
#if defined(OSX)
".globl _ScaleRowDown8Int_SSE2 \n"
"_ScaleRowDown8Int_SSE2: \n"
#else
".global ScaleRowDown8Int_SSE2 \n"
"ScaleRowDown8Int_SSE2: \n"
#endif
DECLARE_FUNCTION(ScaleRowDown8Int_SSE2)
"pusha \n"
"mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%ebx \n"
@@ -1691,14 +1724,7 @@ void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
asm(
".text \n"
#if defined(OSX)
".globl _ScaleRowDown34_SSSE3 \n"
"_ScaleRowDown34_SSSE3: \n"
#else
".global ScaleRowDown34_SSSE3 \n"
"ScaleRowDown34_SSSE3: \n"
#endif
DECLARE_FUNCTION(ScaleRowDown34_SSSE3)
"pusha \n"
"mov 0x24(%esp),%esi \n"
"mov 0x2c(%esp),%edi \n"
@@ -1729,14 +1755,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
asm(
".text \n"
#if defined(OSX)
".globl _ScaleRowDown34_1_Int_SSSE3 \n"
"_ScaleRowDown34_1_Int_SSSE3: \n"
#else
".global ScaleRowDown34_1_Int_SSSE3 \n"
"ScaleRowDown34_1_Int_SSSE3: \n"
#endif
DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3)
"pusha \n"
"mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%ebp \n"
@@ -1790,14 +1809,7 @@ void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
asm(
".text \n"
#if defined(OSX)
".globl _ScaleRowDown34_0_Int_SSSE3 \n"
"_ScaleRowDown34_0_Int_SSSE3: \n"
#else
".global ScaleRowDown34_0_Int_SSSE3 \n"
"ScaleRowDown34_0_Int_SSSE3: \n"
#endif
DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3)
"pusha \n"
"mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%ebp \n"
@@ -1854,14 +1866,7 @@ void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
asm(
".text \n"
#if defined(OSX)
".globl _ScaleRowDown38_SSSE3 \n"
"_ScaleRowDown38_SSSE3: \n"
#else
".global ScaleRowDown38_SSSE3 \n"
"ScaleRowDown38_SSSE3: \n"
#endif
DECLARE_FUNCTION(ScaleRowDown38_SSSE3)
"pusha \n"
"mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%edx \n"
@@ -1890,14 +1895,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
asm(
".text \n"
#if defined(OSX)
".globl _ScaleRowDown38_3_Int_SSSE3 \n"
"_ScaleRowDown38_3_Int_SSSE3: \n"
#else
".global ScaleRowDown38_3_Int_SSSE3 \n"
"ScaleRowDown38_3_Int_SSSE3: \n"
#endif
DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3)
"pusha \n"
"mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%edx \n"
@@ -1954,14 +1952,7 @@ void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
asm(
".text \n"
#if defined(OSX)
".globl _ScaleRowDown38_2_Int_SSSE3 \n"
"_ScaleRowDown38_2_Int_SSSE3: \n"
#else
".global ScaleRowDown38_2_Int_SSSE3 \n"
"ScaleRowDown38_2_Int_SSSE3: \n"
#endif
DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3)
"pusha \n"
"mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%edx \n"
@@ -2001,14 +1992,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
uint16* dst_ptr, int src_width,
int src_height);
asm(
".text \n"
#if defined(OSX)
".globl _ScaleAddRows_SSE2 \n"
"_ScaleAddRows_SSE2: \n"
#else
".global ScaleAddRows_SSE2 \n"
"ScaleAddRows_SSE2: \n"
#endif
DECLARE_FUNCTION(ScaleAddRows_SSE2)
"pusha \n"
"mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%edx \n"
@@ -2052,14 +2036,7 @@ void ScaleFilterRows_SSE2(uint8* dst_ptr,
const uint8* src_ptr, int src_stride,
int dst_width, int source_y_fraction);
asm(
".text \n"
#if defined(OSX)
".globl _ScaleFilterRows_SSE2 \n"
"_ScaleFilterRows_SSE2: \n"
#else
".global ScaleFilterRows_SSE2 \n"
"ScaleFilterRows_SSE2: \n"
#endif
DECLARE_FUNCTION(ScaleFilterRows_SSE2)
"push %esi \n"
"push %edi \n"
"mov 0xc(%esp),%edi \n"
@@ -2147,14 +2124,7 @@ void ScaleFilterRows_SSSE3(uint8* dst_ptr,
const uint8* src_ptr, int src_stride,
int dst_width, int source_y_fraction);
asm(
".text \n"
#if defined(OSX)
".globl _ScaleFilterRows_SSSE3 \n"
"_ScaleFilterRows_SSSE3: \n"
#else
".global ScaleFilterRows_SSSE3 \n"
"ScaleFilterRows_SSSE3: \n"
#endif
DECLARE_FUNCTION(ScaleFilterRows_SSSE3)
"push %esi \n"
"push %edi \n"
"mov 0xc(%esp),%edi \n"
@@ -2318,7 +2288,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr