Commit 454fd586 authored by Yi Luo's avatar Yi Luo Committed by Gerrit Code Review
Browse files

Merge "Optimization for HBD filter intra predictors (SSE4.1)" into nextgenv2

parents 9c6a7cab 8e0360a1
......@@ -32,6 +32,20 @@ typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, int bs,
// 8-bit test parameters: a pair of predictors plus an int, then combined with
// one more int. NOTE(review): presumably (reference, optimized, prediction
// mode) and block size, mirroring the high-bitdepth typedefs below - confirm.
typedef tuple<Predictor, Predictor, int> PredFuncMode;
typedef tuple<PredFuncMode, int> PredParams;
#if CONFIG_VP9_HIGHBITDEPTH
// Signature shared by the high-bitdepth predictors under test: 16-bit pixels
// and a trailing bit-depth argument.
typedef void (*HbdPredictor)(uint16_t *dst, ptrdiff_t stride, int bs,
const uint16_t *above, const uint16_t *left,
int bd);
// High-bitdepth test parameter list:
//   HbdPredFuncMode = (reference predictor, optimized predictor,
//                      prediction mode)
//   HbdPredParams   = (HbdPredFuncMode, block size, bit depth)
typedef tuple<HbdPredictor, HbdPredictor, int> HbdPredFuncMode;
typedef tuple<HbdPredFuncMode, int, int> HbdPredParams;
#endif
// Largest block edge length; the test fixtures size their buffers from it.
const int MaxBlkSize = 32;
// By default, disable speed test
......@@ -136,6 +150,105 @@ class VP10IntraPredOptimzTest : public ::testing::TestWithParam<PredParams> {
uint8_t *predRef_;
};
#if CONFIG_VP9_HIGHBITDEPTH
// Fixture that checks an optimized high-bitdepth intra predictor against its
// reference implementation, for one (predictor pair, block size, bit depth)
// combination supplied through HbdPredParams.
//
// Buffer layout of alloc_ (3 * MaxBlkSize + 2 entries):
//   [0, MaxBlkSize)                 left column
//   [MaxBlkSize + 1, 3*MaxBlkSize+2) above row; its first entry is the pixel
//                                   just before the row, so predictors are
//                                   handed &above[1].
class VP10HbdIntraPredOptimzTest :
    public ::testing::TestWithParam<HbdPredParams> {
 public:
  virtual ~VP10HbdIntraPredOptimzTest() {}

  // Unpacks the test parameters and allocates the working buffers.
  virtual void SetUp() {
    HbdPredFuncMode funcMode = GET_PARAM(0);
    predFuncRef_ = std::tr1::get<0>(funcMode);
    predFunc_ = std::tr1::get<1>(funcMode);
    mode_ = std::tr1::get<2>(funcMode);
    blockSize_ = GET_PARAM(1);
    bd_ = GET_PARAM(2);

    // Allocate with new[] so that the delete[] calls in TearDown() match;
    // pairing malloc() with delete[] is undefined behaviour.
    alloc_ = new uint16_t[3 * MaxBlkSize + 2];
    predRef_ = new uint16_t[MaxBlkSize * MaxBlkSize];
    pred_ = new uint16_t[MaxBlkSize * MaxBlkSize];
  }

  // Releases the buffers allocated in SetUp().
  virtual void TearDown() {
    delete[] alloc_;
    delete[] predRef_;
    delete[] pred_;
    libvpx_test::ClearSystemState();
  }

 protected:
  // Runs both predictors MaxTestNum times on freshly prepared reference
  // pixels and compares their outputs element by element.
  void RunTest() const {
    const int stride = blockSize_;
    uint16_t *left = alloc_;
    uint16_t *above = alloc_ + MaxBlkSize + 1;
    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
      PrepareBuffer();
      predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
      ASM_REGISTER_STATE_CHECK(
          predFunc_(pred_, stride, blockSize_, &above[1], left, bd_));
      DiffPred(tstIndex);
    }
  }

  // Calls the reference predictor MaxTestNum times on one prepared buffer.
  void RunSpeedTestC() const {
    const int stride = blockSize_;
    uint16_t *left = alloc_;
    uint16_t *above = alloc_ + MaxBlkSize + 1;
    PrepareBuffer();
    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
      predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
    }
  }

  // Calls the optimized predictor MaxTestNum times on one prepared buffer.
  // The output goes to predRef_; it is not compared against anything here.
  void RunSpeedTestSSE() const {
    const int stride = blockSize_;
    uint16_t *left = alloc_;
    uint16_t *above = alloc_ + MaxBlkSize + 1;
    PrepareBuffer();
    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
      predFunc_(predRef_, stride, blockSize_, &above[1], left, bd_);
    }
  }

 private:
  // Fills the whole reference-pixel buffer with random values masked to bd_
  // bits.
  // NOTE(review): the generator is re-created from DeterministicSeed() on
  // every call, so presumably every call produces the same contents - confirm
  // whether the iterations in RunTest() are meant to see different data.
  void PrepareBuffer() const {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int pixelMask = (1 << bd_) - 1;
    for (int i = 0; i < (3 * MaxBlkSize + 2); ++i) {
      alloc_[i] = rnd.Rand16() & pixelMask;
    }
  }

  // Compares the first blockSize_ * blockSize_ outputs (stride == blockSize_)
  // and reports every mismatch with its context.
  void DiffPred(int testNum) const {
    const int numPixels = blockSize_ * blockSize_;
    for (int i = 0; i < numPixels; ++i) {
      EXPECT_EQ(predRef_[i], pred_[i])
          << "Error at position: " << i << " "
          << "Block size: " << blockSize_ << " "
          << "Bit depth: " << bd_ << " "
          << "Test number: " << testNum;
    }
  }

  HbdPredictor predFunc_;     // optimized predictor under test
  HbdPredictor predFuncRef_;  // reference predictor
  int mode_;                  // prediction mode taken from the parameter tuple
  int blockSize_;             // block edge length, also used as the stride
  int bd_;                    // bit depth
  uint16_t *alloc_;           // left column + above row reference pixels
  uint16_t *pred_;            // output of predFunc_
  uint16_t *predRef_;         // output of predFuncRef_
};
#endif // CONFIG_VP9_HIGHBITDEPTH
// Runs the fixture's RunTest() for every instantiated parameter combination.
TEST_P(VP10IntraPredOptimzTest, BitExactCheck) {
RunTest();
}
......@@ -150,6 +263,22 @@ TEST_P(VP10IntraPredOptimzTest, SpeedCheckSSE) {
}
#endif
#if CONFIG_VP9_HIGHBITDEPTH
// Compares the optimized high-bitdepth predictor with the reference one.
TEST_P(VP10HbdIntraPredOptimzTest, BitExactCheck) {
RunTest();
}
// The two speed tests are only compiled when PREDICTORS_SPEED_TEST is set.
#if PREDICTORS_SPEED_TEST
// Repeatedly calls the reference predictor only.
TEST_P(VP10HbdIntraPredOptimzTest, SpeedCheckC) {
RunSpeedTestC();
}
// Repeatedly calls the optimized predictor only.
TEST_P(VP10HbdIntraPredOptimzTest, SpeedCheckSSE) {
RunSpeedTestSSE();
}
#endif // PREDICTORS_SPEED_TEST
#endif // CONFIG_VP9_HIGHBITDEPTH
using std::tr1::make_tuple;
const PredFuncMode kPredFuncMdArray[] = {
......@@ -183,4 +312,38 @@ INSTANTIATE_TEST_CASE_P(
::testing::ValuesIn(kPredFuncMdArray),
::testing::ValuesIn(kBlkSize)));
#if CONFIG_VP9_HIGHBITDEPTH
// One entry per prediction mode:
// (reference C predictor, SSE4.1 predictor, prediction mode).
const HbdPredFuncMode kHbdPredFuncMdArray[] = {
make_tuple(vp10_highbd_dc_filter_predictor_c,
vp10_highbd_dc_filter_predictor_sse4_1, DC_PRED),
make_tuple(vp10_highbd_v_filter_predictor_c,
vp10_highbd_v_filter_predictor_sse4_1, V_PRED),
make_tuple(vp10_highbd_h_filter_predictor_c,
vp10_highbd_h_filter_predictor_sse4_1, H_PRED),
make_tuple(vp10_highbd_d45_filter_predictor_c,
vp10_highbd_d45_filter_predictor_sse4_1, D45_PRED),
make_tuple(vp10_highbd_d135_filter_predictor_c,
vp10_highbd_d135_filter_predictor_sse4_1, D135_PRED),
make_tuple(vp10_highbd_d117_filter_predictor_c,
vp10_highbd_d117_filter_predictor_sse4_1, D117_PRED),
make_tuple(vp10_highbd_d153_filter_predictor_c,
vp10_highbd_d153_filter_predictor_sse4_1, D153_PRED),
make_tuple(vp10_highbd_d207_filter_predictor_c,
vp10_highbd_d207_filter_predictor_sse4_1, D207_PRED),
make_tuple(vp10_highbd_d63_filter_predictor_c,
vp10_highbd_d63_filter_predictor_sse4_1, D63_PRED),
make_tuple(vp10_highbd_tm_filter_predictor_c,
vp10_highbd_tm_filter_predictor_sse4_1, TM_PRED),
};
// Bit depths exercised by the high-bitdepth tests.
const int kBd[] = {10, 12};
// Every predictor pair is run for every block size in kBlkSize and every bit
// depth in kBd.
INSTANTIATE_TEST_CASE_P(
SSE4_1, VP10HbdIntraPredOptimzTest,
::testing::Combine(
::testing::ValuesIn(kHbdPredFuncMdArray),
::testing::ValuesIn(kBlkSize),
::testing::ValuesIn(kBd)));
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
......@@ -1071,85 +1071,115 @@ static void highbd_filter_intra_predictors_4tap(uint16_t *dst, ptrdiff_t stride,
}
}
static void highbd_dc_filter_predictor(uint16_t *dst, ptrdiff_t stride,
void vp10_highbd_dc_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
// Forwards every argument to the shared 4-tap filter-intra routine with the
// prediction mode fixed to DC_PRED.
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED,
bd);
}
static void highbd_v_filter_predictor(uint16_t *dst, ptrdiff_t stride,
void vp10_highbd_v_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
// Forwards every argument to the shared 4-tap filter-intra routine with the
// prediction mode fixed to V_PRED.
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED,
bd);
}
static void highbd_h_filter_predictor(uint16_t *dst, ptrdiff_t stride,
void vp10_highbd_h_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
// Forwards every argument to the shared 4-tap filter-intra routine with the
// prediction mode fixed to H_PRED.
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED,
bd);
}
static void highbd_d45_filter_predictor(uint16_t *dst, ptrdiff_t stride,
void vp10_highbd_d45_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
// Forwards every argument to the shared 4-tap filter-intra routine with the
// prediction mode fixed to D45_PRED.
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED,
bd);
}
static void highbd_d135_filter_predictor(uint16_t *dst, ptrdiff_t stride,
void vp10_highbd_d135_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
// Forwards every argument to the shared 4-tap filter-intra routine with the
// prediction mode fixed to D135_PRED.
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED,
bd);
}
static void highbd_d117_filter_predictor(uint16_t *dst, ptrdiff_t stride,
void vp10_highbd_d117_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
// Forwards every argument to the shared 4-tap filter-intra routine with the
// prediction mode fixed to D117_PRED.
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED,
bd);
}
static void highbd_d153_filter_predictor(uint16_t *dst, ptrdiff_t stride,
void vp10_highbd_d153_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
// Forwards every argument to the shared 4-tap filter-intra routine with the
// prediction mode fixed to D153_PRED.
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED,
bd);
}
static void highbd_d207_filter_predictor(uint16_t *dst, ptrdiff_t stride,
void vp10_highbd_d207_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
// Forwards every argument to the shared 4-tap filter-intra routine with the
// prediction mode fixed to D207_PRED.
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED,
bd);
}
static void highbd_d63_filter_predictor(uint16_t *dst, ptrdiff_t stride,
void vp10_highbd_d63_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
// Forwards every argument to the shared 4-tap filter-intra routine with the
// prediction mode fixed to D63_PRED.
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED,
bd);
}
static void highbd_tm_filter_predictor(uint16_t *dst, ptrdiff_t stride,
void vp10_highbd_tm_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
// Forwards every argument to the shared 4-tap filter-intra routine with the
// prediction mode fixed to TM_PRED.
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED,
bd);
}
static void (*highbd_filter_intra_predictors[EXT_INTRA_MODES])(uint16_t *dst,
ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left,
int bd) = {
highbd_dc_filter_predictor, highbd_v_filter_predictor,
highbd_h_filter_predictor, highbd_d45_filter_predictor,
highbd_d135_filter_predictor, highbd_d117_filter_predictor,
highbd_d153_filter_predictor, highbd_d207_filter_predictor,
highbd_d63_filter_predictor, highbd_tm_filter_predictor,
};
// Selects the high-bitdepth filter-intra predictor that corresponds to `mode`
// and calls it with the remaining arguments unchanged. A mode without a
// matching predictor trips the assert and produces no output.
// NOTE(review): the unsuffixed vp10_highbd_*_filter_predictor names are
// presumably the run-time CPU dispatch entry points - confirm.
static void highbd_filter_intra_predictors(int mode, uint16_t *dst,
                                           ptrdiff_t stride, int bs,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  switch (mode) {
    case DC_PRED:
      vp10_highbd_dc_filter_predictor(dst, stride, bs, above, left, bd);
      return;
    case V_PRED:
      vp10_highbd_v_filter_predictor(dst, stride, bs, above, left, bd);
      return;
    case H_PRED:
      vp10_highbd_h_filter_predictor(dst, stride, bs, above, left, bd);
      return;
    case D45_PRED:
      vp10_highbd_d45_filter_predictor(dst, stride, bs, above, left, bd);
      return;
    case D135_PRED:
      vp10_highbd_d135_filter_predictor(dst, stride, bs, above, left, bd);
      return;
    case D117_PRED:
      vp10_highbd_d117_filter_predictor(dst, stride, bs, above, left, bd);
      return;
    case D153_PRED:
      vp10_highbd_d153_filter_predictor(dst, stride, bs, above, left, bd);
      return;
    case D207_PRED:
      vp10_highbd_d207_filter_predictor(dst, stride, bs, above, left, bd);
      return;
    case D63_PRED:
      vp10_highbd_d63_filter_predictor(dst, stride, bs, above, left, bd);
      return;
    case TM_PRED:
      vp10_highbd_tm_filter_predictor(dst, stride, bs, above, left, bd);
      return;
    default:
      break;
  }
  // Only reached for a mode that has no predictor.
  assert(0);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_EXT_INTRA
......@@ -1303,7 +1333,7 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
#if CONFIG_EXT_INTRA
if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
highbd_filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
highbd_filter_intra_predictors(ext_intra_mode, dst, dst_stride, bs,
const_above_row, left_col, xd->bd);
return;
}
......
......@@ -320,6 +320,29 @@ if (vpx_config("CONFIG_EXT_INTRA") eq "yes") {
specialize qw/vp10_d63_filter_predictor sse4_1/;
add_proto qw/void vp10_tm_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
specialize qw/vp10_tm_filter_predictor sse4_1/;
# High bitdepth functions: every filter-intra prediction mode gets the same
# prototype and an SSE4.1 specialization.
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  my $highbd_filter_args = "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
  foreach my $mode (qw/dc v h d45 d135 d117 d153 d207 d63 tm/) {
    my $fn = "vp10_highbd_${mode}_filter_predictor";
    add_proto('void', $fn, $highbd_filter_args);
    specialize($fn, 'sse4_1');
  }
}
}
# High bitdepth functions
......
......@@ -591,3 +591,323 @@ void vp10_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
GetIntraFilterParams(bs, TM_PRED, &prm[0]);
FilterPrediction(above, left, bs, prm, dst, stride);
}
// ============== High Bit Depth ==============
#if CONFIG_VP9_HIGHBITDEPTH
// Rounded mean of the first 4 `above` and first 4 `left` pixels:
// (sum + 4) >> 3. The mean is also broadcast into all four 32-bit lanes of
// *params. `bd` is unused. Note that both loads read 8 pixels (16 bytes).
static INLINE int HighbdGetMeanValue4x4(const uint16_t *above,
                                        const uint16_t *left, const int bd,
                                        __m128i *params) {
  const __m128i above_row = _mm_loadu_si128((const __m128i *)above);
  const __m128i left_col = _mm_loadu_si128((const __m128i *)left);
  // After the pairwise add, lanes 0 and 1 hold the sums of pixels 0..1 and
  // 2..3; the sums of the unused pixels 4..7 land in lanes 2 and 3.
  const __m128i pair_sums =
      _mm_hadd_epi16(_mm_add_epi16(above_row, left_col), _mm_setzero_si128());
  // Shift lane 1 down onto lane 0 and add, so lane 0 holds the full sum.
  const __m128i total =
      _mm_add_epi16(pair_sums, _mm_srli_si128(pair_sums, 2));
  uint16_t mean = _mm_extract_epi16(total, 0);
  (void)bd;

  mean += 4;
  mean >>= 3;
  *params = _mm_set1_epi32(mean);
  return mean;
}
// Rounded mean of 8 `above` and 8 `left` pixels: (sum + 8) >> 4. The mean is
// also broadcast into all four 32-bit lanes of *params. `bd` is unused.
static INLINE int HighbdGetMeanValue8x8(const uint16_t *above,
                                        const uint16_t *left, const int bd,
                                        __m128i *params) {
  const __m128i no_pixels = _mm_setzero_si128();
  const __m128i above_row = _mm_loadu_si128((const __m128i *)above);
  const __m128i left_col = _mm_loadu_si128((const __m128i *)left);
  // 8 lane sums -> 4 partial sums -> 2 partial sums (lanes 0 and 1).
  const __m128i four_sums =
      _mm_hadd_epi16(_mm_add_epi16(above_row, left_col), no_pixels);
  const __m128i two_sums = _mm_hadd_epi16(four_sums, no_pixels);
  // Shift lane 1 down onto lane 0 and add, so lane 0 holds the full sum.
  const __m128i total = _mm_add_epi16(two_sums, _mm_srli_si128(two_sums, 2));
  uint16_t mean = _mm_extract_epi16(total, 0);
  (void)bd;

  mean += 8;
  mean >>= 4;
  *params = _mm_set1_epi32(mean);
  return mean;
}
// Sums 16 `above` and 16 `left` pixels into eight 16-bit lanes: lane i of
// *sum receives above[i] + left[i] + above[i + 8] + left[i + 8].
// The additions are plain 16-bit adds, so the caller must make sure the sums
// fit (the name indicates it is meant for 10-bit pixels).
static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left,
                                  __m128i *sum) {
  const __m128i above_lo = _mm_loadu_si128((const __m128i *)above);
  const __m128i left_lo = _mm_loadu_si128((const __m128i *)left);
  const __m128i above_hi = _mm_loadu_si128((const __m128i *)(above + 8));
  const __m128i left_hi = _mm_loadu_si128((const __m128i *)(left + 8));
  __m128i acc = _mm_add_epi16(above_lo, left_lo);

  acc = _mm_add_epi16(acc, above_hi);
  acc = _mm_add_epi16(acc, left_hi);
  *sum = acc;
}
// Sums 16 `above` and 16 `left` pixels into four 32-bit lanes. Every pixel is
// zero-extended to 32 bits before it is added, so the lane sums do not wrap
// at 16 bits (the name indicates it is meant for 12-bit pixels).
// Lane i of *sum receives the pixels at offsets i, i + 4, i + 8 and i + 12 of
// both `above` and `left`.
static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left,
                                  __m128i *sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i above_lo = _mm_loadu_si128((const __m128i *)above);
  const __m128i left_lo = _mm_loadu_si128((const __m128i *)left);
  const __m128i above_hi = _mm_loadu_si128((const __m128i *)(above + 8));
  const __m128i left_hi = _mm_loadu_si128((const __m128i *)(left + 8));
  __m128i acc;

  // Pixels 0..7: interleaving with zero widens each 16-bit value to 32 bits.
  acc = _mm_add_epi32(_mm_unpacklo_epi16(above_lo, zero),
                      _mm_unpacklo_epi16(left_lo, zero));
  acc = _mm_add_epi32(acc, _mm_unpackhi_epi16(above_lo, zero));
  acc = _mm_add_epi32(acc, _mm_unpackhi_epi16(left_lo, zero));

  // Pixels 8..15.
  acc = _mm_add_epi32(acc, _mm_unpacklo_epi16(above_hi, zero));
  acc = _mm_add_epi32(acc, _mm_unpacklo_epi16(left_hi, zero));
  acc = _mm_add_epi32(acc, _mm_unpackhi_epi16(above_hi, zero));
  acc = _mm_add_epi32(acc, _mm_unpackhi_epi16(left_hi, zero));

  *sum = acc;
}
// Rounded mean of 16 `above` and 16 `left` pixels: (sum + 16) >> 5. The mean
// is also broadcast into all four 32-bit lanes of *params.
//
// The accumulation width is chosen from `bd`:
//   bd <= 10: 32 pixels of at most 1023 sum to at most 32736, which fits the
//             16-bit lanes produced by AddPixels10bit.
//   bd  > 10: the sum can exceed 16 bits, so AddPixels12bit accumulates in
//             32-bit lanes.
// Every bit depth takes one of the two paths; previously any bd other than
// 10 or 12 skipped the summation and returned a mean of 0.
static INLINE int HighbdGetMeanValue16x16(const uint16_t *above,
                                          const uint16_t *left, const int bd,
                                          __m128i *params) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum_vector, u;
  uint32_t sum_value = 0;

  if (bd <= 10) {
    AddPixels10bit(above, left, &sum_vector);
    sum_vector = _mm_hadd_epi16(sum_vector, zero);  // 4 partial sums left
    sum_vector = _mm_hadd_epi16(sum_vector, zero);  // 2 partial sums left
    u = _mm_srli_si128(sum_vector, 2);
    sum_vector = _mm_add_epi16(sum_vector, u);
    sum_value = _mm_extract_epi16(sum_vector, 0);
  } else {
    AddPixels12bit(above, left, &sum_vector);
    sum_vector = _mm_hadd_epi32(sum_vector, zero);  // 2 partial sums left
    u = _mm_srli_si128(sum_vector, 4);
    sum_vector = _mm_add_epi32(u, sum_vector);
    sum_value = _mm_extract_epi32(sum_vector, 0);
  }

  sum_value += 16;
  sum_value >>= 5;
  *params = _mm_set1_epi32(sum_value);
  return sum_value;
}
// Rounded mean of 32 `above` and 32 `left` pixels: (sum + 32) >> 6. The mean
// is also broadcast into all four 32-bit lanes of *params.
//
// The accumulation width is chosen from `bd`:
//   bd <= 10: 64 pixels of at most 1023 sum to at most 65472, which still
//             fits an unsigned 16-bit lane, so AddPixels10bit is used.
//   bd  > 10: the sum can exceed 16 bits, so AddPixels12bit accumulates in
//             32-bit lanes.
// Every bit depth takes one of the two paths; previously any bd other than
// 10 or 12 skipped the summation and returned a mean of 0.
static INLINE int HighbdGetMeanValue32x32(const uint16_t *above,
                                          const uint16_t *left, const int bd,
                                          __m128i *params) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum_vector[2], u;
  uint32_t sum_value = 0;

  if (bd <= 10) {
    AddPixels10bit(above, left, &sum_vector[0]);
    AddPixels10bit(above + 16, left + 16, &sum_vector[1]);

    sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
    sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // 4 partial sums
    sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // 2 partial sums
    u = _mm_srli_si128(sum_vector[0], 2);
    sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
    // _mm_extract_epi16 zero-extends, so a sum above 32767 stays positive.
    sum_value = _mm_extract_epi16(sum_vector[0], 0);
  } else {
    AddPixels12bit(above, left, &sum_vector[0]);
    AddPixels12bit(above + 16, left + 16, &sum_vector[1]);

    sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]);
    sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero);  // 2 partial sums
    u = _mm_srli_si128(sum_vector[0], 4);
    sum_vector[0] = _mm_add_epi32(u, sum_vector[0]);
    sum_value = _mm_extract_epi32(sum_vector[0], 0);
  }

  sum_value += 32;
  sum_value >>= 6;
  *params = _mm_set1_epi32(sum_value);
  return sum_value;
}
// Picks the mean-value helper that matches block size `bs` (4, 8, 16 or 32)
// and returns the mean it computes. The helper also stores the mean,
// repeated as four int32_t values, in *params.
// NOTE(review): the original note calls that slot "params[4]" - presumably
// index 4 of the caller's parameter array; confirm against the caller.
// Any other block size trips the assert and yields 0 with *params untouched.
static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above,
                                               const uint16_t *left, int bs,
                                               const int bd, __m128i *params) {
  switch (bs) {
    case 4:
      return HighbdGetMeanValue4x4(above, left, bd, params);
    case 8:
      return HighbdGetMeanValue8x8(above, left, bd, params);
    case 16:
      return HighbdGetMeanValue16x16(above, left, bd, params);
    case 32:
      return HighbdGetMeanValue32x32(above, left, bd, params);
    default:
      break;
  }
  // Unsupported block size.
  assert(0);
  return 0;
}
// Note:
// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
static void HighbdGeneratePrediction(const uint16_t *above,
const uint16_t *left,
const int bs, const int bd,
const __m128i *prm, int meanValue,
uint16_t *dst,
ptrdiff_t stride) {
int pred[33][65];
int r, c, colBound;
int remainings;
int ipred;
for (r = 0; r < bs; ++r) {
pred[r + 1][0] = (int)left[r] - meanValue;
}
above -= 1;
for (c = 0; c < 2 * bs + 1; ++c) {
pred[0][c] = (int)above[c] - meanValue;
}
r = 0;
c = 0;
while (r < bs) {
colBound = (bs << 1) - r;
for (c = 0; c < colBound; c += 4) {
remainings = colBound - c + 1;
ProducePixels(&pred[r][c], prm, remainings);
}
r += 1;
}
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) {
ipred = pred[r + 1][c + 1] + meanValue;
dst[c] = clip_pixel_highbd(ipred, bd);
}
dst += stride