diff --git a/args.c b/args.c index 14b031040a4b70057e5fa9485c3d4e045c9842d0..51c0fb9c4ebf4818a07044f3b23b968dd6f7c6f4 100644 --- a/args.c +++ b/args.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include <stdlib.h> #include <string.h> #include <limits.h> @@ -22,42 +21,36 @@ extern void die(const char *fmt, ...) __attribute__((noreturn)); extern void die(const char *fmt, ...); #endif - struct arg arg_init(char **argv) { struct arg a; - a.argv = argv; + a.argv = argv; a.argv_step = 1; - a.name = NULL; - a.val = NULL; - a.def = NULL; + a.name = NULL; + a.val = NULL; + a.def = NULL; return a; } int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) { struct arg arg; - if (!argv[0] || argv[0][0] != '-') - return 0; + if (!argv[0] || argv[0][0] != '-') return 0; arg = arg_init(argv); - if (def->short_name - && strlen(arg.argv[0]) == strlen(def->short_name) + 1 - && !strcmp(arg.argv[0] + 1, def->short_name)) { - + if (def->short_name && strlen(arg.argv[0]) == strlen(def->short_name) + 1 && + !strcmp(arg.argv[0] + 1, def->short_name)) { arg.name = arg.argv[0] + 1; arg.val = def->has_val ? arg.argv[1] : NULL; arg.argv_step = def->has_val ? 2 : 1; } else if (def->long_name) { const size_t name_len = strlen(def->long_name); - if (strlen(arg.argv[0]) >= name_len + 2 - && arg.argv[0][1] == '-' - && !strncmp(arg.argv[0] + 2, def->long_name, name_len) - && (arg.argv[0][name_len + 2] == '=' - || arg.argv[0][name_len + 2] == '\0')) { - + if (strlen(arg.argv[0]) >= name_len + 2 && arg.argv[0][1] == '-' && + !strncmp(arg.argv[0] + 2, def->long_name, name_len) && + (arg.argv[0][name_len + 2] == '=' || + arg.argv[0][name_len + 2] == '\0')) { arg.name = arg.argv[0] + 2; arg.val = arg.name[name_len] == '=' ? arg.name + name_len + 1 : NULL; arg.argv_step = 1; @@ -70,8 +63,7 @@ int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) { if (arg.name && arg.val && !def->has_val) die("Error: option %s requires no argument.\n", arg.name); - if (arg.name - && (arg.val || !def->has_val)) { + if (arg.name && (arg.val || !def->has_val)) { arg.def = def; *arg_ = arg; return 1; @@ -80,15 +72,12 @@ int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) { return 0; } - const char *arg_next(struct arg *arg) { - if (arg->argv[0]) - arg->argv += arg->argv_step; + if (arg->argv[0]) arg->argv += arg->argv_step; return *arg->argv; } - char **argv_dup(int argc, const char **argv) { char **new_argv = malloc((argc + 1) * sizeof(*argv)); @@ -97,9 +86,8 @@ char **argv_dup(int argc, const char **argv) { return new_argv; } - void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { - char option_text[40] = {0}; + char option_text[40] = { 0 }; for (; *defs; defs++) { const struct arg_def *def = *defs; @@ -109,15 +97,12 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { if (def->short_name && def->long_name) { char *comma = def->has_val ? 
"," : ", "; - snprintf(option_text, 37, "-%s%s%s --%s%6s", - def->short_name, short_val, comma, - def->long_name, long_val); + snprintf(option_text, 37, "-%s%s%s --%s%6s", def->short_name, short_val, + comma, def->long_name, long_val); } else if (def->short_name) - snprintf(option_text, 37, "-%s%s", - def->short_name, short_val); + snprintf(option_text, 37, "-%s%s", def->short_name, short_val); else if (def->long_name) - snprintf(option_text, 37, " --%s%s", - def->long_name, long_val); + snprintf(option_text, 37, " --%s%s", def->long_name, long_val); fprintf(fp, " %-37s\t%s\n", option_text, def->desc); @@ -127,59 +112,53 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { fprintf(fp, " %-37s\t ", ""); for (listptr = def->enums; listptr->name; listptr++) - fprintf(fp, "%s%s", listptr->name, - listptr[1].name ? ", " : "\n"); + fprintf(fp, "%s%s", listptr->name, listptr[1].name ? ", " : "\n"); } } } - unsigned int arg_parse_uint(const struct arg *arg) { - long int rawval; - char *endptr; + long int rawval; + char *endptr; rawval = strtol(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { - if (rawval >= 0 && rawval <= UINT_MAX) - return rawval; + if (rawval >= 0 && rawval <= UINT_MAX) return rawval; - die("Option %s: Value %ld out of range for unsigned int\n", - arg->name, rawval); + die("Option %s: Value %ld out of range for unsigned int\n", arg->name, + rawval); } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); return 0; } - int arg_parse_int(const struct arg *arg) { - long int rawval; - char *endptr; + long int rawval; + char *endptr; rawval = strtol(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { - if (rawval >= INT_MIN && rawval <= INT_MAX) - return rawval; + if (rawval >= INT_MIN && rawval <= INT_MAX) return rawval; - die("Option %s: Value %ld out of range for signed int\n", - arg->name, rawval); + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); return 0; } - struct vpx_rational { int num; /**< fraction numerator */ int den; /**< fraction denominator */ }; struct vpx_rational arg_parse_rational(const struct arg *arg) { - long int rawval; - char *endptr; - struct vpx_rational rat; + long int rawval; + char *endptr; + struct vpx_rational rat; /* parse numerator */ rawval = strtol(arg->val, &endptr, 10); @@ -187,9 +166,11 @@ struct vpx_rational arg_parse_rational(const struct arg *arg) { if (arg->val[0] != '\0' && endptr[0] == '/') { if (rawval >= INT_MIN && rawval <= INT_MAX) rat.num = rawval; - else die("Option %s: Value %ld out of range for signed int\n", - arg->name, rawval); - } else die("Option %s: Expected / at '%c'\n", arg->name, *endptr); + else + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else + die("Option %s: Expected / at '%c'\n", arg->name, *endptr); /* parse denominator */ rawval = strtol(endptr + 1, &endptr, 10); @@ -197,40 +178,37 @@ struct vpx_rational arg_parse_rational(const struct arg *arg) { if (arg->val[0] != '\0' && endptr[0] == '\0') { if (rawval >= INT_MIN && rawval <= INT_MAX) rat.den = rawval; - else die("Option %s: Value %ld out of range for signed int\n", - arg->name, rawval); - } else die("Option %s: Invalid character '%c'\n", arg->name, *endptr); + else + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else + die("Option %s: Invalid character '%c'\n", arg->name, *endptr); return rat; } - int 
arg_parse_enum(const struct arg *arg) { const struct arg_enum_list *listptr; - long int rawval; - char *endptr; + long int rawval; + char *endptr; /* First see if the value can be parsed as a raw value */ rawval = strtol(arg->val, &endptr, 10); if (arg->val[0] != '\0' && endptr[0] == '\0') { /* Got a raw value, make sure it's valid */ for (listptr = arg->def->enums; listptr->name; listptr++) - if (listptr->val == rawval) - return rawval; + if (listptr->val == rawval) return rawval; } /* Next see if it can be parsed as a string */ for (listptr = arg->def->enums; listptr->name; listptr++) - if (!strcmp(arg->val, listptr->name)) - return listptr->val; + if (!strcmp(arg->val, listptr->name)) return listptr->val; die("Option %s: Invalid value '%s'\n", arg->name, arg->val); return 0; } - int arg_parse_enum_or_int(const struct arg *arg) { - if (arg->def->enums) - return arg_parse_enum(arg); + if (arg->def->enums) return arg_parse_enum(arg); return arg_parse_int(arg); } diff --git a/args.h b/args.h index 1f37151a028681d9b0f215bb6975261e19e5fd1d..54abe04607d97903212315bd1a04981b5335728e 100644 --- a/args.h +++ b/args.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #ifndef ARGS_H_ #define ARGS_H_ #include <stdio.h> @@ -18,29 +17,33 @@ extern "C" { #endif struct arg { - char **argv; - const char *name; - const char *val; - unsigned int argv_step; - const struct arg_def *def; + char **argv; + const char *name; + const char *val; + unsigned int argv_step; + const struct arg_def *def; }; struct arg_enum_list { const char *name; - int val; + int val; }; -#define ARG_ENUM_LIST_END {0} +#define ARG_ENUM_LIST_END \ + { 0 } typedef struct arg_def { const char *short_name; const char *long_name; - int has_val; + int has_val; const char *desc; const struct arg_enum_list *enums; } arg_def_t; -#define ARG_DEF(s,l,v,d) {s,l,v,d, NULL} -#define ARG_DEF_ENUM(s,l,v,d,e) {s,l,v,d,e} -#define ARG_DEF_LIST_END {0} +#define ARG_DEF(s, l, v, d) \ + { s, l, v, d, NULL } +#define ARG_DEF_ENUM(s, l, v, d, e) \ + { s, l, v, d, e } +#define ARG_DEF_LIST_END \ + { 0 } struct arg arg_init(char **argv); int arg_match(struct arg *arg_, const struct arg_def *def, char **argv); diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c index 1ae7a4b57f50292a9d369f60892a14d05ae0f7f2..51959f37df764adbcd01c5180e1445a9e1dbb594 100644 --- a/examples/decode_to_md5.c +++ b/examples/decode_to_md5.c @@ -65,8 +65,7 @@ static void get_image_md5(const vpx_image_t *img, unsigned char digest[16]) { static void print_md5(FILE *stream, unsigned char digest[16]) { int i; - for (i = 0; i < 16; ++i) - fprintf(stream, "%02x", digest[i]); + for (i = 0; i < 16; ++i) fprintf(stream, "%02x", digest[i]); } static const char *exec_name; @@ -86,12 +85,10 @@ int main(int argc, char **argv) { exec_name = argv[0]; - if (argc != 3) - die("Invalid number of arguments."); + if (argc != 3) die("Invalid number of arguments."); reader = vpx_video_reader_open(argv[1]); - if (!reader) - die("Failed to open %s for reading.", argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); @@ -99,8 +96,7 @@ int main(int argc, char **argv) { info = vpx_video_reader_get_info(reader); decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc); - if (!decoder) - die("Unknown input codec."); + if (!decoder) die("Unknown input codec."); printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); @@ -111,8 +107,8 @@ int main(int 
argc, char **argv) { vpx_codec_iter_t iter = NULL; vpx_image_t *img = NULL; size_t frame_size = 0; - const unsigned char *frame = vpx_video_reader_get_frame(reader, - &frame_size); + const unsigned char *frame = + vpx_video_reader_get_frame(reader, &frame_size); if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0)) die_codec(&codec, "Failed to decode frame"); @@ -121,14 +117,13 @@ int main(int argc, char **argv) { get_image_md5(img, digest); print_md5(outfile, digest); - fprintf(outfile, " img-%dx%d-%04d.i420\n", - img->d_w, img->d_h, ++frame_cnt); + fprintf(outfile, " img-%dx%d-%04d.i420\n", img->d_w, img->d_h, + ++frame_cnt); } } printf("Processed %d frames.\n", frame_cnt); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); vpx_video_reader_close(reader); diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c index 2233e473d364c39cf5c104d828b5ae53787f027e..29b8be94131cd59fb8ab9729e9d9f5d6154ead8c 100644 --- a/examples/decode_with_drops.c +++ b/examples/decode_with_drops.c @@ -84,12 +84,10 @@ int main(int argc, char **argv) { exec_name = argv[0]; - if (argc != 4) - die("Invalid number of arguments."); + if (argc != 4) die("Invalid number of arguments."); reader = vpx_video_reader_open(argv[1]); - if (!reader) - die("Failed to open %s for reading.", argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); @@ -103,8 +101,7 @@ int main(int argc, char **argv) { info = vpx_video_reader_get_info(reader); decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc); - if (!decoder) - die("Unknown input codec."); + if (!decoder) die("Unknown input codec."); printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); @@ -116,8 +113,8 @@ int main(int argc, char **argv) { vpx_image_t *img = NULL; size_t frame_size = 0; int skip; - const unsigned char *frame = vpx_video_reader_get_frame(reader, - &frame_size); + const unsigned char *frame = + vpx_video_reader_get_frame(reader, &frame_size); if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0)) die_codec(&codec, "Failed to decode frame."); @@ -139,8 +136,7 @@ int main(int argc, char **argv) { } printf("Processed %d frames.\n", frame_cnt); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", info->frame_width, info->frame_height, argv[2]); diff --git a/examples/lossless_encoder.c b/examples/lossless_encoder.c index da2487820d23dfa1bf334ca38ba852ae3dfd364b..5c380a887c46f70927ea3f984ead18e6daf4124f 100644 --- a/examples/lossless_encoder.c +++ b/examples/lossless_encoder.c @@ -21,32 +21,28 @@ static const char *exec_name; void usage_exit(void) { - fprintf(stderr, "lossless_encoder: Example demonstrating lossless " - "encoding feature. Supports raw input only.\n"); + fprintf(stderr, + "lossless_encoder: Example demonstrating lossless " + "encoding feature. 
Supports raw input only.\n"); fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name); exit(EXIT_FAILURE); } -static int encode_frame(vpx_codec_ctx_t *codec, - vpx_image_t *img, - int frame_index, - int flags, - VpxVideoWriter *writer) { +static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img, + int frame_index, int flags, VpxVideoWriter *writer) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; - const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, - flags, VPX_DL_GOOD_QUALITY); - if (res != VPX_CODEC_OK) - die_codec(codec, "Failed to encode frame"); + const vpx_codec_err_t res = + vpx_codec_encode(codec, img, frame_index, 1, flags, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(codec, "Failed to encode frame"); while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; - if (!vpx_video_writer_write_frame(writer, - pkt->data.frame.buf, + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(codec, "Failed to write compressed frame"); @@ -66,19 +62,17 @@ int main(int argc, char **argv) { int frame_count = 0; vpx_image_t raw; vpx_codec_err_t res; - VpxVideoInfo info = {0}; + VpxVideoInfo info = { 0 }; VpxVideoWriter *writer = NULL; const VpxInterface *encoder = NULL; const int fps = 30; exec_name = argv[0]; - if (argc < 5) - die("Invalid number of arguments"); + if (argc < 5) die("Invalid number of arguments"); encoder = get_vpx_encoder_by_name("vp9"); - if (!encoder) - die("Unsupported codec."); + if (!encoder) die("Unsupported codec."); info.codec_fourcc = encoder->fourcc; info.frame_width = strtol(argv[1], NULL, 0); @@ -86,23 +80,20 @@ int main(int argc, char **argv) { info.time_base.numerator = 1; info.time_base.denominator = fps; - if (info.frame_width <= 0 || - info.frame_height <= 0 || - (info.frame_width % 2) != 0 || - (info.frame_height % 2) != 0) { + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, - info.frame_height, 1)) { + info.frame_height, 1)) { die("Failed to allocate image."); } printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); - if (res) - die_codec(&codec, "Failed to get default codec config."); + if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; @@ -110,8 +101,7 @@ int main(int argc, char **argv) { cfg.g_timebase.den = info.time_base.denominator; writer = vpx_video_writer_open(argv[4], kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing.", argv[4]); + if (!writer) die("Failed to open %s for writing.", argv[4]); if (!(infile = fopen(argv[3], "rb"))) die("Failed to open %s for reading.", argv[3]); @@ -128,15 +118,15 @@ int main(int argc, char **argv) { } // Flush encoder. 
- while (encode_frame(&codec, NULL, -1, 0, writer)) {} + while (encode_frame(&codec, NULL, -1, 0, writer)) { + } printf("\n"); fclose(infile); printf("Processed %d frames.\n", frame_count); vpx_img_free(&raw); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); vpx_video_writer_close(writer); diff --git a/examples/resize_util.c b/examples/resize_util.c index e6fdd5bb2af2bdd91e0974fc0dc5b90fd0b9e4b8..7e529b2e203123d7fb85de478b1c31416e79b7f8 100644 --- a/examples/resize_util.c +++ b/examples/resize_util.c @@ -34,10 +34,8 @@ void usage_exit(void) { static int parse_dim(char *v, int *width, int *height) { char *x = strchr(v, 'x'); - if (x == NULL) - x = strchr(v, 'X'); - if (x == NULL) - return 0; + if (x == NULL) x = strchr(v, 'X'); + if (x == NULL) return 0; *width = atoi(v); *height = atoi(&x[1]); if (*width <= 0 || *height <= 0) @@ -93,30 +91,25 @@ int main(int argc, char *argv[]) { else frames = INT_MAX; - printf("Input size: %dx%d\n", - width, height); - printf("Target size: %dx%d, Frames: ", - target_width, target_height); + printf("Input size: %dx%d\n", width, height); + printf("Target size: %dx%d, Frames: ", target_width, target_height); if (frames == INT_MAX) printf("All\n"); else printf("%d\n", frames); - inbuf = (uint8_t*)malloc(width * height * 3 / 2); - outbuf = (uint8_t*)malloc(target_width * target_height * 3 / 2); + inbuf = (uint8_t *)malloc(width * height * 3 / 2); + outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2); inbuf_u = inbuf + width * height; inbuf_v = inbuf_u + width * height / 4; outbuf_u = outbuf + target_width * target_height; outbuf_v = outbuf_u + target_width * target_height / 4; f = 0; while (f < frames) { - if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) - break; - vp9_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, - height, width, - outbuf, target_width, outbuf_u, outbuf_v, - target_width / 2, - target_height, target_width); + if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) break; + vp9_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height, + width, outbuf, target_width, outbuf_u, outbuf_v, + target_width / 2, target_height, target_width); fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout); f++; } diff --git a/examples/set_maps.c b/examples/set_maps.c index 1dc3ac0c98f8e1b75e6eff2372b64cd1c399f9af..d128e7d9a0d7822ef25be53510e8856a476db177 100644 --- a/examples/set_maps.c +++ b/examples/set_maps.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - // VP8 Set Active and ROI Maps // =========================== // @@ -86,8 +85,7 @@ static void set_roi_map(const vpx_codec_enc_cfg_t *cfg, roi.static_threshold[3] = 0; roi.roi_map = (uint8_t *)malloc(roi.rows * roi.cols); - for (i = 0; i < roi.rows * roi.cols; ++i) - roi.roi_map[i] = i % 4; + for (i = 0; i < roi.rows * roi.cols; ++i) roi.roi_map[i] = i % 4; if (vpx_codec_control(codec, VP8E_SET_ROI_MAP, &roi)) die_codec(codec, "Failed to set ROI map"); @@ -98,14 +96,13 @@ static void set_roi_map(const vpx_codec_enc_cfg_t *cfg, static void set_active_map(const vpx_codec_enc_cfg_t *cfg, vpx_codec_ctx_t *codec) { unsigned int i; - vpx_active_map_t map = {0, 0, 0}; + vpx_active_map_t map = { 0, 0, 0 }; map.rows = (cfg->g_h + 15) / 16; map.cols = (cfg->g_w + 15) / 16; map.active_map = (uint8_t *)malloc(map.rows * map.cols); - for (i = 0; i < map.rows * map.cols; ++i) - map.active_map[i] = i % 2; + for (i = 0; i < map.rows * map.cols; ++i) map.active_map[i] = i % 2; if (vpx_codec_control(codec, VP8E_SET_ACTIVEMAP, &map)) die_codec(codec, "Failed to set active map"); @@ -115,7 +112,7 @@ static void set_active_map(const vpx_codec_enc_cfg_t *cfg, static void unset_active_map(const vpx_codec_enc_cfg_t *cfg, vpx_codec_ctx_t *codec) { - vpx_active_map_t map = {0, 0, 0}; + vpx_active_map_t map = { 0, 0, 0 }; map.rows = (cfg->g_h + 15) / 16; map.cols = (cfg->g_w + 15) / 16; @@ -125,25 +122,21 @@ static void unset_active_map(const vpx_codec_enc_cfg_t *cfg, die_codec(codec, "Failed to set active map"); } -static int encode_frame(vpx_codec_ctx_t *codec, - vpx_image_t *img, - int frame_index, - VpxVideoWriter *writer) { +static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img, + int frame_index, VpxVideoWriter *writer) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; - const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, 0, - VPX_DL_GOOD_QUALITY); - if (res != VPX_CODEC_OK) - die_codec(codec, "Failed to encode frame"); + const vpx_codec_err_t res = + vpx_codec_encode(codec, img, frame_index, 1, 0, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(codec, "Failed to encode frame"); while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; - if (!vpx_video_writer_write_frame(writer, - pkt->data.frame.buf, + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(codec, "Failed to write compressed frame"); @@ -167,12 +160,11 @@ int main(int argc, char **argv) { VpxVideoInfo info; VpxVideoWriter *writer = NULL; const VpxInterface *encoder = NULL; - const int fps = 2; // TODO(dkovalev) add command line argument + const int fps = 2; // TODO(dkovalev) add command line argument const double bits_per_pixel_per_frame = 0.067; exec_name = argv[0]; - if (argc != 6) - die("Invalid number of arguments"); + if (argc != 6) die("Invalid number of arguments"); memset(&info, 0, sizeof(info)); @@ -187,35 +179,31 @@ int main(int argc, char **argv) { info.time_base.numerator = 1; info.time_base.denominator = fps; - if (info.frame_width <= 0 || - info.frame_height <= 0 || - (info.frame_width % 2) != 0 || - (info.frame_height % 2) != 0) { + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!vpx_img_alloc(&raw, 
VPX_IMG_FMT_I420, info.frame_width, - info.frame_height, 1)) { + info.frame_height, 1)) { die("Failed to allocate image."); } printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); - if (res) - die_codec(&codec, "Failed to get default codec config."); + if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; cfg.g_timebase.num = info.time_base.numerator; cfg.g_timebase.den = info.time_base.denominator; - cfg.rc_target_bitrate = (unsigned int)(bits_per_pixel_per_frame * cfg.g_w * - cfg.g_h * fps / 1000); + cfg.rc_target_bitrate = + (unsigned int)(bits_per_pixel_per_frame * cfg.g_w * cfg.g_h * fps / 1000); cfg.g_lag_in_frames = 0; writer = vpx_video_writer_open(argv[5], kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing.", argv[5]); + if (!writer) die("Failed to open %s for writing.", argv[5]); if (!(infile = fopen(argv[4], "rb"))) die("Failed to open %s for reading.", argv[4]); @@ -239,15 +227,15 @@ int main(int argc, char **argv) { } // Flush encoder. - while (encode_frame(&codec, NULL, -1, writer)) {} + while (encode_frame(&codec, NULL, -1, writer)) { + } printf("\n"); fclose(infile); printf("Processed %d frames.\n", frame_count); vpx_img_free(&raw); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); vpx_video_writer_close(writer); diff --git a/examples/simple_decoder.c b/examples/simple_decoder.c index 8ccc81035e3ba649ca0e6eb216ebbf0896c6a0bc..2bb1a05245bd3abe27fa2e46cc061ba822b0a7eb 100644 --- a/examples/simple_decoder.c +++ b/examples/simple_decoder.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - // Simple Decoder // ============== // @@ -103,12 +102,10 @@ int main(int argc, char **argv) { exec_name = argv[0]; - if (argc != 3) - die("Invalid number of arguments."); + if (argc != 3) die("Invalid number of arguments."); reader = vpx_video_reader_open(argv[1]); - if (!reader) - die("Failed to open %s for reading.", argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); if (!(outfile = fopen(argv[2], "wb"))) die("Failed to open %s for writing.", argv[2]); @@ -116,8 +113,7 @@ int main(int argc, char **argv) { info = vpx_video_reader_get_info(reader); decoder = get_vpx_decoder_by_fourcc(info->codec_fourcc); - if (!decoder) - die("Unknown input codec."); + if (!decoder) die("Unknown input codec."); printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); @@ -128,8 +124,8 @@ int main(int argc, char **argv) { vpx_codec_iter_t iter = NULL; vpx_image_t *img = NULL; size_t frame_size = 0; - const unsigned char *frame = vpx_video_reader_get_frame(reader, - &frame_size); + const unsigned char *frame = + vpx_video_reader_get_frame(reader, &frame_size); if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0)) die_codec(&codec, "Failed to decode frame."); @@ -140,8 +136,7 @@ int main(int argc, char **argv) { } printf("Processed %d frames.\n", frame_cnt); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec"); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", info->frame_width, info->frame_height, argv[2]); diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c index 64f0a01379a1a3ea4cbd8ea667d02a4d4bcbdffd..331a2a595dd980d3d51fab6daa9fbc57bcd387dc 100644 --- a/examples/simple_encoder.c +++ b/examples/simple_encoder.c @@ -109,32 +109,27 @@ static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile> " - "<keyframe-interval> <error-resilient> <frames to encode>\n" - "See comments in simple_encoder.c for more information.\n", + "<keyframe-interval> <error-resilient> <frames to encode>\n" + "See comments in simple_encoder.c for more information.\n", exec_name); exit(EXIT_FAILURE); } -static int encode_frame(vpx_codec_ctx_t *codec, - vpx_image_t *img, - int frame_index, - int flags, - VpxVideoWriter *writer) { +static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img, + int frame_index, int flags, VpxVideoWriter *writer) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; - const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, - flags, VPX_DL_GOOD_QUALITY); - if (res != VPX_CODEC_OK) - die_codec(codec, "Failed to encode frame"); + const vpx_codec_err_t res = + vpx_codec_encode(codec, img, frame_index, 1, flags, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(codec, "Failed to encode frame"); while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { got_pkts = 1; if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; - if (!vpx_video_writer_write_frame(writer, - pkt->data.frame.buf, + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(codec, "Failed to write compressed frame"); @@ -155,7 +150,7 @@ int main(int argc, char **argv) { int frame_count = 0; vpx_image_t raw; vpx_codec_err_t res; - VpxVideoInfo info = {0}; + VpxVideoInfo info = { 0 }; VpxVideoWriter 
*writer = NULL; const VpxInterface *encoder = NULL; const int fps = 30; @@ -172,8 +167,7 @@ int main(int argc, char **argv) { exec_name = argv[0]; - if (argc != 9) - die("Invalid number of arguments"); + if (argc != 9) die("Invalid number of arguments"); codec_arg = argv[1]; width_arg = argv[2]; @@ -184,8 +178,7 @@ int main(int argc, char **argv) { max_frames = strtol(argv[8], NULL, 0); encoder = get_vpx_encoder_by_name(codec_arg); - if (!encoder) - die("Unsupported codec."); + if (!encoder) die("Unsupported codec."); info.codec_fourcc = encoder->fourcc; info.frame_width = strtol(width_arg, NULL, 0); @@ -193,27 +186,23 @@ int main(int argc, char **argv) { info.time_base.numerator = 1; info.time_base.denominator = fps; - if (info.frame_width <= 0 || - info.frame_height <= 0 || - (info.frame_width % 2) != 0 || - (info.frame_height % 2) != 0) { + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, - info.frame_height, 1)) { + info.frame_height, 1)) { die("Failed to allocate image."); } keyframe_interval = strtol(keyframe_interval_arg, NULL, 0); - if (keyframe_interval < 0) - die("Invalid keyframe interval value."); + if (keyframe_interval < 0) die("Invalid keyframe interval value."); printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); - if (res) - die_codec(&codec, "Failed to get default codec config."); + if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; @@ -223,8 +212,7 @@ int main(int argc, char **argv) { cfg.g_error_resilient = strtol(argv[7], NULL, 0); writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing.", outfile_arg); + if (!writer) die("Failed to open %s for writing.", outfile_arg); if (!(infile = fopen(infile_arg, "rb"))) die("Failed to open %s for reading.", infile_arg); @@ -239,20 +227,19 @@ int main(int argc, char **argv) { flags |= VPX_EFLAG_FORCE_KF; encode_frame(&codec, &raw, frame_count++, flags, writer); frames_encoded++; - if (max_frames > 0 && frames_encoded >= max_frames) - break; + if (max_frames > 0 && frames_encoded >= max_frames) break; } // Flush encoder. 
- while (encode_frame(&codec, NULL, -1, 0, writer)) {}; + while (encode_frame(&codec, NULL, -1, 0, writer)) { + } printf("\n"); fclose(infile); printf("Processed %d frames.\n", frame_count); vpx_img_free(&raw); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); vpx_video_writer_close(writer); diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c index 15a6617cd4f282a09ea046e652ac868ceff5c1b3..4c130ec18c3aef24d51ec5c6442338a8fa2fc8eb 100644 --- a/examples/twopass_encoder.c +++ b/examples/twopass_encoder.c @@ -61,25 +61,21 @@ static const char *exec_name; void usage_exit(void) { fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile> " - "<frame limit>\n", + "<frame limit>\n", exec_name); exit(EXIT_FAILURE); } -static int get_frame_stats(vpx_codec_ctx_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned int duration, - vpx_enc_frame_flags_t flags, - unsigned int deadline, +static int get_frame_stats(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned int duration, + vpx_enc_frame_flags_t flags, unsigned int deadline, vpx_fixed_buf_t *stats) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; - const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags, - deadline); - if (res != VPX_CODEC_OK) - die_codec(ctx, "Failed to get frame stats."); + const vpx_codec_err_t res = + vpx_codec_encode(ctx, img, pts, duration, flags, deadline); + if (res != VPX_CODEC_OK) die_codec(ctx, "Failed to get frame stats."); while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) { got_pkts = 1; @@ -96,20 +92,16 @@ static int get_frame_stats(vpx_codec_ctx_t *ctx, return got_pkts; } -static int encode_frame(vpx_codec_ctx_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned int duration, - vpx_enc_frame_flags_t flags, - unsigned int deadline, +static int encode_frame(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned int duration, + vpx_enc_frame_flags_t flags, unsigned int deadline, VpxVideoWriter *writer) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; - const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags, - deadline); - if (res != VPX_CODEC_OK) - die_codec(ctx, "Failed to encode frame."); + const vpx_codec_err_t res = + vpx_codec_encode(ctx, img, pts, duration, flags, deadline); + if (res != VPX_CODEC_OK) die_codec(ctx, "Failed to encode frame."); while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) { got_pkts = 1; @@ -117,8 +109,8 @@ static int encode_frame(vpx_codec_ctx_t *ctx, const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, - pkt->data.frame.sz, - pkt->data.frame.pts)) + pkt->data.frame.sz, + pkt->data.frame.pts)) die_codec(ctx, "Failed to write compressed frame."); printf(keyframe ? 
"K" : "."); fflush(stdout); @@ -128,14 +120,12 @@ static int encode_frame(vpx_codec_ctx_t *ctx, return got_pkts; } -static vpx_fixed_buf_t pass0(vpx_image_t *raw, - FILE *infile, +static vpx_fixed_buf_t pass0(vpx_image_t *raw, FILE *infile, const VpxInterface *encoder, - const vpx_codec_enc_cfg_t *cfg, - int max_frames) { + const vpx_codec_enc_cfg_t *cfg, int max_frames) { vpx_codec_ctx_t codec; int frame_count = 0; - vpx_fixed_buf_t stats = {NULL, 0}; + vpx_fixed_buf_t stats = { NULL, 0 }; if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0)) die_codec(&codec, "Failed to initialize encoder"); @@ -145,40 +135,33 @@ static vpx_fixed_buf_t pass0(vpx_image_t *raw, ++frame_count; get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, &stats); - if (max_frames > 0 && frame_count >= max_frames) - break; + if (max_frames > 0 && frame_count >= max_frames) break; } // Flush encoder. - while (get_frame_stats(&codec, NULL, frame_count, 1, 0, - VPX_DL_GOOD_QUALITY, &stats)) {} + while (get_frame_stats(&codec, NULL, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, + &stats)) { + } printf("Pass 0 complete. Processed %d frames.\n", frame_count); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); return stats; } -static void pass1(vpx_image_t *raw, - FILE *infile, - const char *outfile_name, - const VpxInterface *encoder, - const vpx_codec_enc_cfg_t *cfg, +static void pass1(vpx_image_t *raw, FILE *infile, const char *outfile_name, + const VpxInterface *encoder, const vpx_codec_enc_cfg_t *cfg, int max_frames) { - VpxVideoInfo info = { - encoder->fourcc, - cfg->g_w, - cfg->g_h, - {cfg->g_timebase.num, cfg->g_timebase.den} - }; + VpxVideoInfo info = { encoder->fourcc, + cfg->g_w, + cfg->g_h, + { cfg->g_timebase.num, cfg->g_timebase.den } }; VpxVideoWriter *writer = NULL; vpx_codec_ctx_t codec; int frame_count = 0; writer = vpx_video_writer_open(outfile_name, kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing", outfile_name); + if (!writer) die("Failed to open %s for writing", outfile_name); if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0)) die_codec(&codec, "Failed to initialize encoder"); @@ -188,17 +171,16 @@ static void pass1(vpx_image_t *raw, ++frame_count; encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer); - if (max_frames > 0 && frame_count >= max_frames) - break; + if (max_frames > 0 && frame_count >= max_frames) break; } // Flush encoder. 
- while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_GOOD_QUALITY, writer)) {} + while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_GOOD_QUALITY, writer)) { + } printf("\n"); - if (vpx_codec_destroy(&codec)) - die_codec(&codec, "Failed to destroy codec."); + if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); vpx_video_writer_close(writer); @@ -215,8 +197,8 @@ int main(int argc, char **argv) { vpx_fixed_buf_t stats; const VpxInterface *encoder = NULL; - const int fps = 30; // TODO(dkovalev) add command line argument - const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument + const int fps = 30; // TODO(dkovalev) add command line argument + const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument const char *const codec_arg = argv[1]; const char *const width_arg = argv[2]; const char *const height_arg = argv[3]; @@ -225,19 +207,17 @@ int main(int argc, char **argv) { int max_frames = 0; exec_name = argv[0]; - if (argc != 7) - die("Invalid number of arguments."); + if (argc != 7) die("Invalid number of arguments."); max_frames = strtol(argv[6], NULL, 0); encoder = get_vpx_encoder_by_name(codec_arg); - if (!encoder) - die("Unsupported codec."); + if (!encoder) die("Unsupported codec."); w = strtol(width_arg, NULL, 0); h = strtol(height_arg, NULL, 0); - if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0) + if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0) die("Invalid frame size: %dx%d", w, h); if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1)) @@ -247,8 +227,7 @@ int main(int argc, char **argv) { // Configuration res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); - if (res) - die_codec(&codec, "Failed to get default codec config."); + if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = w; cfg.g_h = h; diff --git a/examples/vpxcx_set_ref.c b/examples/vpxcx_set_ref.c index 25164857c9ed7eb953e177493f9ee33d7c095a88..6771d422e1a5de8a7fbb48c381cff86ea52292d9 100644 --- a/examples/vpxcx_set_ref.c +++ b/examples/vpxcx_set_ref.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - // VP10 Set Reference Frame // ============================ // @@ -61,7 +60,8 @@ static const char *exec_name; void usage_exit() { - fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile> " + fprintf(stderr, + "Usage: %s <codec> <width> <height> <infile> <outfile> " "<frame> <limit(optional)>\n", exec_name); exit(EXIT_FAILURE); @@ -70,8 +70,7 @@ void usage_exit() { static int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2) { uint32_t l_w = img1->d_w; - uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; const uint32_t c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; uint32_t i; @@ -99,10 +98,10 @@ static int compare_img(const vpx_image_t *const img1, return match; } -#define mmin(a, b) ((a) < (b) ? (a) : (b)) +#define mmin(a, b) ((a) < (b) ? 
(a) : (b)) static void find_mismatch(const vpx_image_t *const img1, - const vpx_image_t *const img2, - int yloc[4], int uloc[4], int vloc[4]) { + const vpx_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]) { const uint32_t bsize = 64; const uint32_t bsizey = bsize >> img1->y_chroma_shift; const uint32_t bsizex = bsize >> img1->x_chroma_shift; @@ -190,21 +189,18 @@ static void find_mismatch(const vpx_image_t *const img1, } } -static void testing_decode(vpx_codec_ctx_t *encoder, - vpx_codec_ctx_t *decoder, - vpx_codec_enc_cfg_t *cfg, - unsigned int frame_out, +static void testing_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder, + vpx_codec_enc_cfg_t *cfg, unsigned int frame_out, int *mismatch_seen) { vpx_image_t enc_img, dec_img; struct vp9_ref_frame ref_enc, ref_dec; - if (*mismatch_seen) - return; + if (*mismatch_seen) return; ref_enc.idx = 0; ref_dec.idx = 0; if (vpx_codec_control(encoder, VP9_GET_REFERENCE, &ref_enc)) - die_codec(encoder, "Failed to get encoder reference frame"); + die_codec(encoder, "Failed to get encoder reference frame"); enc_img = ref_enc.img; if (vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref_dec)) die_codec(decoder, "Failed to get decoder reference frame"); @@ -216,37 +212,31 @@ static void testing_decode(vpx_codec_ctx_t *encoder, *mismatch_seen = 1; find_mismatch(&enc_img, &dec_img, y, u, v); - printf("Encode/decode mismatch on frame %d at" - " Y[%d, %d] {%d/%d}," - " U[%d, %d] {%d/%d}," - " V[%d, %d] {%d/%d}", - frame_out, - y[0], y[1], y[2], y[3], - u[0], u[1], u[2], u[3], - v[0], v[1], v[2], v[3]); + printf( + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}", + frame_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], + v[2], v[3]); } vpx_img_free(&enc_img); vpx_img_free(&dec_img); } -static int encode_frame(vpx_codec_ctx_t *ecodec, - vpx_codec_enc_cfg_t *cfg, - vpx_image_t *img, - unsigned int frame_in, - VpxVideoWriter *writer, - int test_decode, - vpx_codec_ctx_t *dcodec, - unsigned int *frame_out, +static int encode_frame(vpx_codec_ctx_t *ecodec, vpx_codec_enc_cfg_t *cfg, + vpx_image_t *img, unsigned int frame_in, + VpxVideoWriter *writer, int test_decode, + vpx_codec_ctx_t *dcodec, unsigned int *frame_out, int *mismatch_seen) { int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; int got_data; - const vpx_codec_err_t res = vpx_codec_encode(ecodec, img, frame_in, 1, - 0, VPX_DL_GOOD_QUALITY); - if (res != VPX_CODEC_OK) - die_codec(ecodec, "Failed to encode frame"); + const vpx_codec_err_t res = + vpx_codec_encode(ecodec, img, frame_in, 1, 0, VPX_DL_GOOD_QUALITY); + if (res != VPX_CODEC_OK) die_codec(ecodec, "Failed to encode frame"); got_data = 0; @@ -257,11 +247,10 @@ static int encode_frame(vpx_codec_ctx_t *ecodec, const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) { - *frame_out += 1; - } + *frame_out += 1; + } - if (!vpx_video_writer_write_frame(writer, - pkt->data.frame.buf, + if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf, pkt->data.frame.sz, pkt->data.frame.pts)) { die_codec(ecodec, "Failed to write compressed frame"); @@ -290,12 +279,12 @@ static int encode_frame(vpx_codec_ctx_t *ecodec, int main(int argc, char **argv) { FILE *infile = NULL; // Encoder - vpx_codec_ctx_t ecodec = {0}; - vpx_codec_enc_cfg_t cfg = {0}; + vpx_codec_ctx_t ecodec = { 0 }; + vpx_codec_enc_cfg_t cfg = { 0 }; unsigned int frame_in = 0; vpx_image_t raw; 
vpx_codec_err_t res; - VpxVideoInfo info = {0}; + VpxVideoInfo info = { 0 }; VpxVideoWriter *writer = NULL; const VpxInterface *encoder = NULL; @@ -320,8 +309,7 @@ int main(int argc, char **argv) { unsigned int limit = 0; exec_name = argv[0]; - if (argc < 7) - die("Invalid number of arguments"); + if (argc < 7) die("Invalid number of arguments"); codec_arg = argv[1]; width_arg = argv[2]; @@ -330,15 +318,13 @@ int main(int argc, char **argv) { outfile_arg = argv[5]; encoder = get_vpx_encoder_by_name(codec_arg); - if (!encoder) - die("Unsupported codec."); + if (!encoder) die("Unsupported codec."); update_frame_num = atoi(argv[6]); // In VP10, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are // allocated while calling vpx_codec_encode(), thus, setting reference for // 1st frame isn't supported. - if (update_frame_num <= 1) - die("Couldn't parse frame number '%s'\n", argv[6]); + if (update_frame_num <= 1) die("Couldn't parse frame number '%s'\n", argv[6]); if (argc > 7) { limit = atoi(argv[7]); @@ -352,23 +338,20 @@ int main(int argc, char **argv) { info.time_base.numerator = 1; info.time_base.denominator = fps; - if (info.frame_width <= 0 || - info.frame_height <= 0 || - (info.frame_width % 2) != 0 || - (info.frame_height % 2) != 0) { + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); } if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width, - info.frame_height, 1)) { + info.frame_height, 1)) { die("Failed to allocate image."); } printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); - if (res) - die_codec(&ecodec, "Failed to get default codec config."); + if (res) die_codec(&ecodec, "Failed to get default codec config."); cfg.g_w = info.frame_width; cfg.g_h = info.frame_height; @@ -378,8 +361,7 @@ int main(int argc, char **argv) { cfg.g_lag_in_frames = 3; writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing.", outfile_arg); + if (!writer) die("Failed to open %s for writing.", outfile_arg); if (!(infile = fopen(infile_arg, "rb"))) die("Failed to open %s for reading.", infile_arg); @@ -392,15 +374,14 @@ int main(int argc, char **argv) { die_codec(&ecodec, "Failed to set enable auto alt ref"); if (test_decode) { - const VpxInterface *decoder = get_vpx_decoder_by_name(codec_arg); - if (vpx_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0)) - die_codec(&dcodec, "Failed to initialize decoder."); + const VpxInterface *decoder = get_vpx_decoder_by_name(codec_arg); + if (vpx_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0)) + die_codec(&dcodec, "Failed to initialize decoder."); } // Encode frames. while (vpx_img_read(&raw, infile)) { - if (limit && frame_in >= limit) - break; + if (limit && frame_in >= limit) break; if (update_frame_num > 1 && frame_out + 1 == update_frame_num) { vpx_ref_frame_t ref; ref.frame_type = VP8_LAST_FRAME; @@ -418,17 +399,17 @@ int main(int argc, char **argv) { } } - encode_frame(&ecodec, &cfg, &raw, frame_in, writer, test_decode, - &dcodec, &frame_out, &mismatch_seen); + encode_frame(&ecodec, &cfg, &raw, frame_in, writer, test_decode, &dcodec, + &frame_out, &mismatch_seen); frame_in++; - if (mismatch_seen) - break; + if (mismatch_seen) break; } // Flush encoder. 
if (!mismatch_seen) while (encode_frame(&ecodec, &cfg, NULL, frame_in, writer, test_decode, - &dcodec, &frame_out, &mismatch_seen)) {} + &dcodec, &frame_out, &mismatch_seen)) { + } printf("\n"); fclose(infile); diff --git a/ivfdec.c b/ivfdec.c index 7fc25a0e8105e6e2a96cc5026b0042ab54cff286..f64e594ab0e6952a83363518fe0175672196d452 100644 --- a/ivfdec.c +++ b/ivfdec.c @@ -46,7 +46,8 @@ int file_is_ivf(struct VpxInputContext *input_ctx) { is_ivf = 1; if (mem_get_le16(raw_hdr + 4) != 0) { - fprintf(stderr, "Error: Unrecognized IVF version! This file may not" + fprintf(stderr, + "Error: Unrecognized IVF version! This file may not" " decode properly."); } @@ -69,14 +70,13 @@ int file_is_ivf(struct VpxInputContext *input_ctx) { return is_ivf; } -int ivf_read_frame(FILE *infile, uint8_t **buffer, - size_t *bytes_read, size_t *buffer_size) { - char raw_header[IVF_FRAME_HDR_SZ] = {0}; +int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size) { + char raw_header[IVF_FRAME_HDR_SZ] = { 0 }; size_t frame_size = 0; if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) { - if (!feof(infile)) - warn("Failed to read frame size\n"); + if (!feof(infile)) warn("Failed to read frame size\n"); } else { frame_size = mem_get_le32(raw_header); diff --git a/ivfdec.h b/ivfdec.h index dd29cc6174b73d579ee3c4e15e372e29d73cc9a6..af725572b48dcfcee53c63c186799321ccdae5e2 100644 --- a/ivfdec.h +++ b/ivfdec.h @@ -18,11 +18,11 @@ extern "C" { int file_is_ivf(struct VpxInputContext *input); -int ivf_read_frame(FILE *infile, uint8_t **buffer, - size_t *bytes_read, size_t *buffer_size); +int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size); #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif #endif // IVFDEC_H_ diff --git a/ivfenc.c b/ivfenc.c index 4a97c42731c93523379fcd2cb80c2bc5524c3722..a50d31839da01f09f5bf1e2284a3373222e2cc52 100644 --- a/ivfenc.c +++ b/ivfenc.c @@ -13,10 +13,8 @@ #include "vpx/vpx_encoder.h" #include "vpx_ports/mem_ops.h" -void ivf_write_file_header(FILE *outfile, - const struct vpx_codec_enc_cfg *cfg, - unsigned int fourcc, - int frame_cnt) { +void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg, + unsigned int fourcc, int frame_cnt) { char header[32]; header[0] = 'D'; diff --git a/ivfenc.h b/ivfenc.h index 6623687e8444061081a8393c578b3c794f1b7426..ebdce47be8f659f1e36115c1880f33c8dd53c6d2 100644 --- a/ivfenc.h +++ b/ivfenc.h @@ -19,17 +19,15 @@ struct vpx_codec_cx_pkt; extern "C" { #endif -void ivf_write_file_header(FILE *outfile, - const struct vpx_codec_enc_cfg *cfg, - uint32_t fourcc, - int frame_cnt); +void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg, + uint32_t fourcc, int frame_cnt); void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size); void ivf_write_frame_size(FILE *outfile, size_t frame_size); #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif #endif // IVFENC_H_ diff --git a/md5_utils.c b/md5_utils.c index a9b979a419787cfc5211111c6c0aefe79104a060..093798b833983e57d2fdcd7a1bc9a4839a9b496f 100644 --- a/md5_utils.c +++ b/md5_utils.c @@ -20,19 +20,17 @@ * Still in the public domain. 
*/ -#include <string.h> /* for memcpy() */ +#include <string.h> /* for memcpy() */ #include "md5_utils.h" -static void -byteSwap(UWORD32 *buf, unsigned words) { +static void byteSwap(UWORD32 *buf, unsigned words) { md5byte *p; /* Only swap bytes for big endian machines */ int i = 1; - if (*(char *)&i == 1) - return; + if (*(char *)&i == 1) return; p = (md5byte *)buf; @@ -47,8 +45,7 @@ byteSwap(UWORD32 *buf, unsigned words) { * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious * initialization constants. */ -void -MD5Init(struct MD5Context *ctx) { +void MD5Init(struct MD5Context *ctx) { ctx->buf[0] = 0x67452301; ctx->buf[1] = 0xefcdab89; ctx->buf[2] = 0x98badcfe; @@ -62,8 +59,7 @@ MD5Init(struct MD5Context *ctx) { * Update context to reflect the concatenation of another buffer full * of bytes. */ -void -MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { +void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { UWORD32 t; /* Update byte count */ @@ -71,9 +67,9 @@ MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { t = ctx->bytes[0]; if ((ctx->bytes[0] = t + len) < t) - ctx->bytes[1]++; /* Carry from low to high */ + ctx->bytes[1]++; /* Carry from low to high */ - t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */ + t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */ if (t > len) { memcpy((md5byte *)ctx->in + 64 - t, buf, len); @@ -104,8 +100,7 @@ MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { * Final wrapup - pad to 64-byte boundary with the bit pattern * 1 0* (64-bit count of bits processed, MSB-first) */ -void -MD5Final(md5byte digest[16], struct MD5Context *ctx) { +void MD5Final(md5byte digest[16], struct MD5Context *ctx) { int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */ md5byte *p = (md5byte *)ctx->in + count; @@ -115,7 +110,7 @@ MD5Final(md5byte digest[16], struct MD5Context *ctx) { /* Bytes of padding needed to make 56 bytes (-8..55) */ count = 56 - 1 - count; - if (count < 0) { /* Padding forces an extra block */ + if (count < 0) { /* Padding forces an extra block */ memset(p, 0, count + 8); byteSwap(ctx->in, 16); MD5Transform(ctx->buf, ctx->in); @@ -147,8 +142,8 @@ MD5Final(md5byte digest[16], struct MD5Context *ctx) { #define F4(x, y, z) (y ^ (x | ~z)) /* This is the central step in the MD5 algorithm. */ -#define MD5STEP(f,w,x,y,z,in,s) \ - (w += f(x,y,z) + in, w = (w<<s | w>>(32-s)) + x) +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) #if defined(__clang__) && defined(__has_attribute) #if __has_attribute(no_sanitize) @@ -166,8 +161,8 @@ MD5Final(md5byte digest[16], struct MD5Context *ctx) { * reflect the addition of 16 longwords of new data. MD5Update blocks * the data and converts bytes into longwords for this routine. 
*/ -VPX_NO_UNSIGNED_OVERFLOW_CHECK void -MD5Transform(UWORD32 buf[4], UWORD32 const in[16]) { +VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], + UWORD32 const in[16]) { register UWORD32 a, b, c, d; a = buf[0]; diff --git a/rate_hist.c b/rate_hist.c index a77222b16186644caceb6d0ba2c5c08eeee76329..872a10bae0b3b69d95083a169f22905a770788a8 100644 --- a/rate_hist.c +++ b/rate_hist.c @@ -45,8 +45,7 @@ struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg, hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000; // prevent division by zero - if (hist->samples == 0) - hist->samples = 1; + if (hist->samples == 0) hist->samples = 1; hist->frames = 0; hist->total = 0; @@ -78,18 +77,16 @@ void update_rate_histogram(struct rate_hist *hist, int64_t avg_bitrate = 0; int64_t sum_sz = 0; const int64_t now = pkt->data.frame.pts * 1000 * - (uint64_t)cfg->g_timebase.num / - (uint64_t)cfg->g_timebase.den; + (uint64_t)cfg->g_timebase.num / + (uint64_t)cfg->g_timebase.den; int idx = hist->frames++ % hist->samples; hist->pts[idx] = now; hist->sz[idx] = (int)pkt->data.frame.sz; - if (now < cfg->rc_buf_initial_sz) - return; + if (now < cfg->rc_buf_initial_sz) return; - if (!cfg->rc_target_bitrate) - return; + if (!cfg->rc_target_bitrate) return; then = now; @@ -98,20 +95,16 @@ void update_rate_histogram(struct rate_hist *hist, const int i_idx = (i - 1) % hist->samples; then = hist->pts[i_idx]; - if (now - then > cfg->rc_buf_sz) - break; + if (now - then > cfg->rc_buf_sz) break; sum_sz += hist->sz[i_idx]; } - if (now == then) - return; + if (now == then) return; avg_bitrate = sum_sz * 8 * 1000 / (now - then); idx = (int)(avg_bitrate * (RATE_BINS / 2) / (cfg->rc_target_bitrate * 1000)); - if (idx < 0) - idx = 0; - if (idx > RATE_BINS - 1) - idx = RATE_BINS - 1; + if (idx < 0) idx = 0; + if (idx > RATE_BINS - 1) idx = RATE_BINS - 1; if (hist->bucket[idx].low > avg_bitrate) hist->bucket[idx].low = (int)avg_bitrate; if (hist->bucket[idx].high < avg_bitrate) @@ -120,8 +113,8 @@ void update_rate_histogram(struct rate_hist *hist, hist->total++; } -static int merge_hist_buckets(struct hist_bucket *bucket, - int max_buckets, int *num_buckets) { +static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets, + int *num_buckets) { int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0; int buckets = *num_buckets; int i; @@ -129,10 +122,8 @@ static int merge_hist_buckets(struct hist_bucket *bucket, /* Find the extrema for this list of buckets */ big_bucket = small_bucket = 0; for (i = 0; i < buckets; i++) { - if (bucket[i].count < bucket[small_bucket].count) - small_bucket = i; - if (bucket[i].count > bucket[big_bucket].count) - big_bucket = i; + if (bucket[i].count < bucket[small_bucket].count) small_bucket = i; + if (bucket[i].count > bucket[big_bucket].count) big_bucket = i; } /* If we have too many buckets, merge the smallest with an adjacent @@ -174,13 +165,10 @@ static int merge_hist_buckets(struct hist_bucket *bucket, */ big_bucket = small_bucket = 0; for (i = 0; i < buckets; i++) { - if (i > merge_bucket) - bucket[i] = bucket[i + 1]; + if (i > merge_bucket) bucket[i] = bucket[i + 1]; - if (bucket[i].count < bucket[small_bucket].count) - small_bucket = i; - if (bucket[i].count > bucket[big_bucket].count) - big_bucket = i; + if (bucket[i].count < bucket[small_bucket].count) small_bucket = i; + if (bucket[i].count > bucket[big_bucket].count) big_bucket = i; } } @@ -188,8 +176,8 @@ static int merge_hist_buckets(struct hist_bucket *bucket, return 
bucket[big_bucket].count; } -static void show_histogram(const struct hist_bucket *bucket, - int buckets, int total, int scale) { +static void show_histogram(const struct hist_bucket *bucket, int buckets, + int total, int scale) { const char *pat1, *pat2; int i; @@ -232,8 +220,7 @@ static void show_histogram(const struct hist_bucket *bucket, pct = (float)(100.0 * bucket[i].count / total); len = HIST_BAR_MAX * bucket[i].count / scale; - if (len < 1) - len = 1; + if (len < 1) len = 1; assert(len <= HIST_BAR_MAX); if (bucket[i].low == bucket[i].high) @@ -241,8 +228,7 @@ static void show_histogram(const struct hist_bucket *bucket, else fprintf(stderr, pat2, bucket[i].low, bucket[i].high); - for (j = 0; j < HIST_BAR_MAX; j++) - fprintf(stderr, j < len ? "=" : " "); + for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " "); fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct); } } @@ -268,14 +254,13 @@ void show_q_histogram(const int counts[64], int max_buckets) { show_histogram(bucket, buckets, total, scale); } -void show_rate_histogram(struct rate_hist *hist, - const vpx_codec_enc_cfg_t *cfg, int max_buckets) { +void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg, + int max_buckets) { int i, scale; int buckets = 0; for (i = 0; i < RATE_BINS; i++) { - if (hist->bucket[i].low == INT_MAX) - continue; + if (hist->bucket[i].low == INT_MAX) continue; hist->bucket[buckets++] = hist->bucket[i]; } diff --git a/tools_common.c b/tools_common.c index 83eec5013f5a3d6b20b393fdb82f234752e0f68b..e1c89a4cf651490e1d5b5f1c462e67944eb52a1b 100644 --- a/tools_common.c +++ b/tools_common.c @@ -29,23 +29,22 @@ #include <fcntl.h> #ifdef __OS2__ -#define _setmode setmode -#define _fileno fileno -#define _O_BINARY O_BINARY +#define _setmode setmode +#define _fileno fileno +#define _O_BINARY O_BINARY #endif #endif -#define LOG_ERROR(label) do {\ - const char *l = label;\ - va_list ap;\ - va_start(ap, fmt);\ - if (l)\ - fprintf(stderr, "%s: ", l);\ - vfprintf(stderr, fmt, ap);\ - fprintf(stderr, "\n");\ - va_end(ap);\ -} while (0) - +#define LOG_ERROR(label) \ + do { \ + const char *l = label; \ + va_list ap; \ + va_start(ap, fmt); \ + if (l) fprintf(stderr, "%s: ", l); \ + vfprintf(stderr, fmt, ap); \ + fprintf(stderr, "\n"); \ + va_end(ap); \ + } while (0) FILE *set_binary_mode(FILE *stream) { (void)stream; @@ -65,16 +64,13 @@ void fatal(const char *fmt, ...) { exit(EXIT_FAILURE); } -void warn(const char *fmt, ...) { - LOG_ERROR("Warning"); -} +void warn(const char *fmt, ...) { LOG_ERROR("Warning"); } void die_codec(vpx_codec_ctx_t *ctx, const char *s) { const char *detail = vpx_codec_error_detail(ctx); printf("%s: %s\n", s, vpx_codec_error(ctx)); - if (detail) - printf(" %s\n", detail); + if (detail) printf(" %s\n", detail); exit(EXIT_FAILURE); } @@ -97,15 +93,16 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) { */ switch (plane) { case 1: - ptr = yuv_frame->planes[ - yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V : VPX_PLANE_U]; + ptr = + yuv_frame->planes[yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V + : VPX_PLANE_U]; break; case 2: - ptr = yuv_frame->planes[ - yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_U : VPX_PLANE_V]; + ptr = + yuv_frame->planes[yuv_frame->fmt == VPX_IMG_FMT_YV12 ? 
VPX_PLANE_U + : VPX_PLANE_V]; break; - default: - ptr = yuv_frame->planes[plane]; + default: ptr = yuv_frame->planes[plane]; } for (r = 0; r < h; ++r) { @@ -134,7 +131,7 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) { static const VpxInterface vpx_encoders[] = { #if CONFIG_VP10_ENCODER - {"vp10", VP10_FOURCC, &vpx_codec_vp10_cx}, + { "vp10", VP10_FOURCC, &vpx_codec_vp10_cx }, #endif }; @@ -142,17 +139,14 @@ int get_vpx_encoder_count(void) { return sizeof(vpx_encoders) / sizeof(vpx_encoders[0]); } -const VpxInterface *get_vpx_encoder_by_index(int i) { - return &vpx_encoders[i]; -} +const VpxInterface *get_vpx_encoder_by_index(int i) { return &vpx_encoders[i]; } const VpxInterface *get_vpx_encoder_by_name(const char *name) { int i; for (i = 0; i < get_vpx_encoder_count(); ++i) { const VpxInterface *encoder = get_vpx_encoder_by_index(i); - if (strcmp(encoder->name, name) == 0) - return encoder; + if (strcmp(encoder->name, name) == 0) return encoder; } return NULL; @@ -165,7 +159,7 @@ const VpxInterface *get_vpx_encoder_by_name(const char *name) { static const VpxInterface vpx_decoders[] = { #if CONFIG_VP10_DECODER - {"vp10", VP10_FOURCC, &vpx_codec_vp10_dx}, + { "vp10", VP10_FOURCC, &vpx_codec_vp10_dx }, #endif }; @@ -173,17 +167,14 @@ int get_vpx_decoder_count(void) { return sizeof(vpx_decoders) / sizeof(vpx_decoders[0]); } -const VpxInterface *get_vpx_decoder_by_index(int i) { - return &vpx_decoders[i]; -} +const VpxInterface *get_vpx_decoder_by_index(int i) { return &vpx_decoders[i]; } const VpxInterface *get_vpx_decoder_by_name(const char *name) { int i; for (i = 0; i < get_vpx_decoder_count(); ++i) { - const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - if (strcmp(decoder->name, name) == 0) - return decoder; + const VpxInterface *const decoder = get_vpx_decoder_by_index(i); + if (strcmp(decoder->name, name) == 0) return decoder; } return NULL; @@ -194,8 +185,7 @@ const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc) { for (i = 0; i < get_vpx_decoder_count(); ++i) { const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - if (decoder->fourcc == fourcc) - return decoder; + if (decoder->fourcc == fourcc) return decoder; } return NULL; @@ -213,7 +203,7 @@ int vpx_img_plane_width(const vpx_image_t *img, int plane) { } int vpx_img_plane_height(const vpx_image_t *img, int plane) { - if (plane > 0 && img->y_chroma_shift > 0) + if (plane > 0 && img->y_chroma_shift > 0) return (img->d_h + 1) >> img->y_chroma_shift; else return img->d_h; @@ -226,7 +216,7 @@ void vpx_img_write(const vpx_image_t *img, FILE *file) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); const int h = vpx_img_plane_height(img, plane); int y; @@ -244,13 +234,12 @@ int vpx_img_read(vpx_image_t *img, FILE *file) { unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1); const int h = vpx_img_plane_height(img, plane); int y; for (y = 0; y < h; ++y) { - if (fread(buf, 1, w, file) != (size_t)w) - return 0; + if (fread(buf, 1, w, file) != (size_t)w) return 0; buf += stride; } } @@ -279,19 +268,16 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, int plane; if (dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || - dst->y_chroma_shift != src->y_chroma_shift || - dst->fmt != src->fmt || input_shift < 0) { + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + input_shift < 0) { fatal("Unsupported image conversion"); } switch (src->fmt) { case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: - case VPX_IMG_FMT_I44016: - break; - default: - fatal("Unsupported image conversion"); - break; + case VPX_IMG_FMT_I44016: break; + default: fatal("Unsupported image conversion"); break; } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -306,8 +292,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, (uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); - for (x = 0; x < w; x++) - *p_dst++ = (*p_src++ << input_shift) + offset; + for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset; } } } @@ -320,19 +305,15 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, if (dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || dst->y_chroma_shift != src->y_chroma_shift || - dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH || - input_shift < 0) { + dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH || input_shift < 0) { fatal("Unsupported image conversion"); } switch (src->fmt) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: - case VPX_IMG_FMT_I440: - break; - default: - fatal("Unsupported image conversion"); - break; + case VPX_IMG_FMT_I440: break; + default: fatal("Unsupported image conversion"); break; } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -353,8 +334,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, } } -void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, - int input_shift) { +void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, int input_shift) { if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { highbd_img_upshift(dst, src, input_shift); } else { @@ -364,9 +344,8 @@ void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) { int plane; - if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt || - dst->d_w != src->d_w || dst->d_h != src->d_h || - dst->x_chroma_shift != src->x_chroma_shift || + if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt || dst->d_w != src->d_w || + dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || dst->y_chroma_shift != src->y_chroma_shift) { fatal("Unsupported image conversion"); } @@ -374,11 +353,8 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: - case VPX_IMG_FMT_I440: - break; - default: - fatal("Unsupported image conversion"); - break; + case VPX_IMG_FMT_I440: break; + default: fatal("Unsupported image conversion"); break; } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -404,19 +380,16 @@ static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, int plane; if (dst->d_w != src->d_w || dst->d_h != src->d_h || 
dst->x_chroma_shift != src->x_chroma_shift || - dst->y_chroma_shift != src->y_chroma_shift || - dst->fmt != src->fmt || down_shift < 0) { + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + down_shift < 0) { fatal("Unsupported image conversion"); } switch (src->fmt) { case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: - case VPX_IMG_FMT_I44016: - break; - default: - fatal("Unsupported image conversion"); - break; + case VPX_IMG_FMT_I44016: break; + default: fatal("Unsupported image conversion"); break; } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -431,8 +404,7 @@ static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, (uint16_t *)(src->planes[plane] + y * src->stride[plane]); uint16_t *p_dst = (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); - for (x = 0; x < w; x++) - *p_dst++ = *p_src++ >> down_shift; + for (x = 0; x < w; x++) *p_dst++ = *p_src++ >> down_shift; } } } @@ -443,19 +415,15 @@ static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, if (dst->d_w != src->d_w || dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || dst->y_chroma_shift != src->y_chroma_shift || - src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH || - down_shift < 0) { + src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH || down_shift < 0) { fatal("Unsupported image conversion"); } switch (dst->fmt) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: - case VPX_IMG_FMT_I440: - break; - default: - fatal("Unsupported image conversion"); - break; + case VPX_IMG_FMT_I440: break; + default: fatal("Unsupported image conversion"); break; } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -476,8 +444,7 @@ static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, } } -void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, - int down_shift) { +void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift) { if (dst->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { highbd_img_downshift(dst, src, down_shift); } else { diff --git a/tools_common.h b/tools_common.h index 98347b6f27150f6d9fe5f50bfd72ae68e8459fea..ce4fbf8f66e114e9a4a7ff866bc652d5aee9cd2e 100644 --- a/tools_common.h +++ b/tools_common.h @@ -30,24 +30,24 @@ /* MinGW uses f{seek,tell}o64 for large files. */ #define fseeko fseeko64 #define ftello ftello64 -#endif /* _WIN32 */ +#endif /* _WIN32 */ #if CONFIG_OS_SUPPORT #if defined(_MSC_VER) -#include <io.h> /* NOLINT */ -#define isatty _isatty -#define fileno _fileno +#include <io.h> /* NOLINT */ +#define isatty _isatty +#define fileno _fileno #else -#include <unistd.h> /* NOLINT */ -#endif /* _MSC_VER */ -#endif /* CONFIG_OS_SUPPORT */ +#include <unistd.h> /* NOLINT */ +#endif /* _MSC_VER */ +#endif /* CONFIG_OS_SUPPORT */ /* Use 32-bit file operations in WebM file format when building ARM * executables (.axf) with RVCT. 
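For orientation, the lookup helpers reformatted in tools_common.c above are typically used as in this minimal sketch (assuming a build with the vp10 entries enabled in the tables shown; fatal() is the helper defined in the same file):

#include <stdio.h>
#include "./tools_common.h"

static void pick_codecs(void) {
  /* Select an encoder by its short name, as listed in vpx_encoders[]. */
  const VpxInterface *encoder = get_vpx_encoder_by_name("vp10");
  if (!encoder) fatal("Unsupported codec name: vp10");

  /* Select the matching decoder by fourcc, e.g. from a container header. */
  const VpxInterface *decoder = get_vpx_decoder_by_fourcc(encoder->fourcc);
  if (!decoder) fatal("No decoder for fourcc 0x%08x", (unsigned int)encoder->fourcc);

  printf("Using %s (decoder: %s)\n", encoder->name, decoder->name);
}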
*/ #if !CONFIG_OS_SUPPORT #define fseeko fseek #define ftello ftell -#endif /* CONFIG_OS_SUPPORT */ +#endif /* CONFIG_OS_SUPPORT */ #define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) @@ -55,7 +55,7 @@ #define PATH_MAX 512 #endif -#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ +#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ #define IVF_FILE_HDR_SZ 32 #define RAW_FRAME_HDR_SZ sizeof(uint32_t) @@ -158,7 +158,7 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src); #endif #ifdef __cplusplus -} /* extern "C" */ +} /* extern "C" */ #endif #endif // TOOLS_COMMON_H_ diff --git a/video_reader.c b/video_reader.c index 39c7edba1e3deb5f8d3362deeb26ba581f8a5136..a0ba2521c6135f06bff917b920d321eccbcf1f5f 100644 --- a/video_reader.c +++ b/video_reader.c @@ -30,21 +30,17 @@ VpxVideoReader *vpx_video_reader_open(const char *filename) { char header[32]; VpxVideoReader *reader = NULL; FILE *const file = fopen(filename, "rb"); - if (!file) - return NULL; // Can't open file + if (!file) return NULL; // Can't open file - if (fread(header, 1, 32, file) != 32) - return NULL; // Can't read file header + if (fread(header, 1, 32, file) != 32) return NULL; // Can't read file header if (memcmp(kIVFSignature, header, 4) != 0) return NULL; // Wrong IVF signature - if (mem_get_le16(header + 4) != 0) - return NULL; // Wrong IVF version + if (mem_get_le16(header + 4) != 0) return NULL; // Wrong IVF version reader = calloc(1, sizeof(*reader)); - if (!reader) - return NULL; // Can't allocate VpxVideoReader + if (!reader) return NULL; // Can't allocate VpxVideoReader reader->file = file; reader->info.codec_fourcc = mem_get_le32(header + 8); @@ -71,8 +67,7 @@ int vpx_video_reader_read_frame(VpxVideoReader *reader) { const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader, size_t *size) { - if (size) - *size = reader->frame_size; + if (size) *size = reader->frame_size; return reader->buffer; } @@ -80,4 +75,3 @@ const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader, const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader) { return &reader->info; } - diff --git a/video_reader.h b/video_reader.h index a62c6d7109ad7d6ddeabc23686194033dd5ef298..73c25b00a7d94740dda8e80ece3148d6f5a018a2 100644 --- a/video_reader.h +++ b/video_reader.h @@ -39,8 +39,7 @@ int vpx_video_reader_read_frame(VpxVideoReader *reader); // Returns the pointer to memory buffer with frame data read by last call to // vpx_video_reader_read_frame(). -const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader, - size_t *size); +const uint8_t *vpx_video_reader_get_frame(VpxVideoReader *reader, size_t *size); // Fills VpxVideoInfo with information from opened video file. 
const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader); diff --git a/video_writer.c b/video_writer.c index 3695236bfa5232801598953b99f511dac538a7ad..56d428b0720f7101451ff3b130fa9d47550d9dea 100644 --- a/video_writer.c +++ b/video_writer.c @@ -37,12 +37,10 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename, if (container == kContainerIVF) { VpxVideoWriter *writer = NULL; FILE *const file = fopen(filename, "wb"); - if (!file) - return NULL; + if (!file) return NULL; writer = malloc(sizeof(*writer)); - if (!writer) - return NULL; + if (!writer) return NULL; writer->frame_count = 0; writer->info = *info; @@ -67,12 +65,10 @@ void vpx_video_writer_close(VpxVideoWriter *writer) { } } -int vpx_video_writer_write_frame(VpxVideoWriter *writer, - const uint8_t *buffer, size_t size, - int64_t pts) { +int vpx_video_writer_write_frame(VpxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts) { ivf_write_frame_header(writer->file, pts, size); - if (fwrite(buffer, 1, size, writer->file) != size) - return 0; + if (fwrite(buffer, 1, size, writer->file) != size) return 0; ++writer->frame_count; diff --git a/video_writer.h b/video_writer.h index 5dbfe52ea00f9c7e8dac94bbe189dd1aa611e69b..a769811c44042c3b96f6e9eea5c8802149facc95 100644 --- a/video_writer.h +++ b/video_writer.h @@ -13,9 +13,7 @@ #include "./video_common.h" -typedef enum { - kContainerIVF -} VpxContainer; +typedef enum { kContainerIVF } VpxContainer; struct VpxVideoWriterStruct; typedef struct VpxVideoWriterStruct VpxVideoWriter; @@ -36,9 +34,8 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename, void vpx_video_writer_close(VpxVideoWriter *writer); // Writes frame bytes to the file. -int vpx_video_writer_write_frame(VpxVideoWriter *writer, - const uint8_t *buffer, size_t size, - int64_t pts); +int vpx_video_writer_write_frame(VpxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts); #ifdef __cplusplus } // extern "C" diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index c61b836831c18fa990ce21bf0761b6a99673431e..6a253a57e2438238d806ec3c56b661856c2e4574 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\file * \brief Describes the decoder algorithm interface for algorithm * implementations. @@ -61,7 +60,7 @@ extern "C" { */ #define VPX_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/ -typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; +typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t; /*!\brief init function pointer prototype @@ -77,8 +76,8 @@ typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t; * \retval #VPX_CODEC_MEM_ERROR * Memory operation failed. 
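A quick usage sketch for the IVF reader API being reformatted here; the close counterpart and the exact return convention of read_frame() are assumed from the same header rather than shown in these hunks:

#include <stdio.h>
#include "./video_reader.h"

static void dump_frame_sizes(const char *filename) {
  VpxVideoReader *reader = vpx_video_reader_open(filename);
  if (!reader) return;  /* could not open, or not a valid IVF file */

  const VpxVideoInfo *info = vpx_video_reader_get_info(reader);
  printf("fourcc: 0x%08x\n", (unsigned int)info->codec_fourcc);

  /* read_frame() is assumed to return nonzero while frames remain. */
  while (vpx_video_reader_read_frame(reader)) {
    size_t frame_size = 0;
    (void)vpx_video_reader_get_frame(reader, &frame_size);
    printf("frame of %u bytes\n", (unsigned int)frame_size);
  }

  vpx_video_reader_close(reader);  /* assumed counterpart to _open() */
}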
*/ -typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx, - vpx_codec_priv_enc_mr_cfg_t *data); +typedef vpx_codec_err_t (*vpx_codec_init_fn_t)( + vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *data); /*!\brief destroy function pointer prototype * @@ -112,8 +111,8 @@ typedef vpx_codec_err_t (*vpx_codec_destroy_fn_t)(vpx_codec_alg_priv_t *ctx); * \retval #VPX_CODEC_OK * Bitstream is parsable and stream information updated */ -typedef vpx_codec_err_t (*vpx_codec_peek_si_fn_t)(const uint8_t *data, - unsigned int data_sz, +typedef vpx_codec_err_t (*vpx_codec_peek_si_fn_t)(const uint8_t *data, + unsigned int data_sz, vpx_codec_stream_info_t *si); /*!\brief Return information about the current stream. @@ -129,7 +128,7 @@ typedef vpx_codec_err_t (*vpx_codec_peek_si_fn_t)(const uint8_t *data, * \retval #VPX_CODEC_OK * Bitstream is parsable and stream information updated */ -typedef vpx_codec_err_t (*vpx_codec_get_si_fn_t)(vpx_codec_alg_priv_t *ctx, +typedef vpx_codec_err_t (*vpx_codec_get_si_fn_t)(vpx_codec_alg_priv_t *ctx, vpx_codec_stream_info_t *si); /*!\brief control function pointer prototype @@ -193,11 +192,11 @@ typedef const struct vpx_codec_ctrl_fn_map { * see the descriptions of the other error codes in ::vpx_codec_err_t * for recoverability capabilities. */ -typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, - const uint8_t *data, - unsigned int data_sz, - void *user_priv, - long deadline); +typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, + const uint8_t *data, + unsigned int data_sz, + void *user_priv, + long deadline); /*!\brief Decoded frames iterator * @@ -206,7 +205,8 @@ typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, * complete when this function returns NULL. * * The list of available frames becomes valid upon completion of the - * vpx_codec_decode call, and remains valid until the next call to vpx_codec_decode. + * vpx_codec_decode call, and remains valid until the next call to + * vpx_codec_decode. * * \param[in] ctx Pointer to this instance's context * \param[in out] iter Iterator storage, initialized to NULL @@ -215,7 +215,7 @@ typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, * produced will always be in PTS (presentation time stamp) order. */ typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx, - vpx_codec_iter_t *iter); + vpx_codec_iter_t *iter); /*!\brief Pass in external frame buffers for the decoder to use. * @@ -244,32 +244,28 @@ typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx, * buffers. 
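The get_frame iterator contract documented just above is mirrored by the public wrappers reformatted later in this diff (vpx_codec_decode() and vpx_codec_get_frame() in vpx/src/vpx_decoder.c); a minimal drain loop, assuming codec, frame, frame_size and outfile are set up elsewhere:

vpx_codec_iter_t iter = NULL;
vpx_image_t *img = NULL;

if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 0))
  die_codec(&codec, "Failed to decode frame");

/* Frames become valid after vpx_codec_decode() and are drained until NULL. */
while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
  vpx_img_write(img, outfile);  /* raw planar dump from tools_common.c */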
*/ typedef vpx_codec_err_t (*vpx_codec_set_fb_fn_t)( - vpx_codec_alg_priv_t *ctx, - vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_codec_alg_priv_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); +typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, + const vpx_image_t *img, + vpx_codec_pts_t pts, + unsigned long duration, + vpx_enc_frame_flags_t flags, + unsigned long deadline); +typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)( + vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter); -typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned long duration, - vpx_enc_frame_flags_t flags, - unsigned long deadline); -typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)(vpx_codec_alg_priv_t *ctx, - vpx_codec_iter_t *iter); - -typedef vpx_codec_err_t -(*vpx_codec_enc_config_set_fn_t)(vpx_codec_alg_priv_t *ctx, - const vpx_codec_enc_cfg_t *cfg); -typedef vpx_fixed_buf_t * -(*vpx_codec_get_global_headers_fn_t)(vpx_codec_alg_priv_t *ctx); +typedef vpx_codec_err_t (*vpx_codec_enc_config_set_fn_t)( + vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg); +typedef vpx_fixed_buf_t *(*vpx_codec_get_global_headers_fn_t)( + vpx_codec_alg_priv_t *ctx); -typedef vpx_image_t * -(*vpx_codec_get_preview_frame_fn_t)(vpx_codec_alg_priv_t *ctx); +typedef vpx_image_t *(*vpx_codec_get_preview_frame_fn_t)( + vpx_codec_alg_priv_t *ctx); -typedef vpx_codec_err_t -(*vpx_codec_enc_mr_get_mem_loc_fn_t)(const vpx_codec_enc_cfg_t *cfg, - void **mem_loc); +typedef vpx_codec_err_t (*vpx_codec_enc_mr_get_mem_loc_fn_t)( + const vpx_codec_enc_cfg_t *cfg, void **mem_loc); /*!\brief usage configuration mapping * @@ -282,7 +278,7 @@ typedef vpx_codec_err_t * */ typedef const struct vpx_codec_enc_cfg_map { - int usage; + int usage; vpx_codec_enc_cfg_t cfg; } vpx_codec_enc_cfg_map_t; @@ -291,41 +287,47 @@ typedef const struct vpx_codec_enc_cfg_map { * All decoders \ref MUST expose a variable of this type. 
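The encode side pairs vpx_codec_encode() and vpx_codec_get_cx_data() (both reformatted later in this diff) with the IVF writer from video_writer.c; the packet-kind check and the data.frame.buf member come from the public encoder header rather than from these hunks, so treat them as assumptions:

vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt = NULL;

if (vpx_codec_encode(&codec, img, pts, 1 /* duration */, 0 /* flags */,
                     0 /* deadline */))
  die_codec(&codec, "Failed to encode frame");

while ((pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) continue;  /* assumed from vpx_encoder.h */
  if (!vpx_video_writer_write_frame(writer, pkt->data.frame.buf,
                                    pkt->data.frame.sz, pkt->data.frame.pts))
    die_codec(&codec, "Failed to write compressed frame");
}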
*/ struct vpx_codec_iface { - const char *name; /**< Identification String */ - int abi_version; /**< Implemented ABI version */ - vpx_codec_caps_t caps; /**< Decoder capabilities */ - vpx_codec_init_fn_t init; /**< \copydoc ::vpx_codec_init_fn_t */ - vpx_codec_destroy_fn_t destroy; /**< \copydoc ::vpx_codec_destroy_fn_t */ - vpx_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::vpx_codec_ctrl_fn_map_t */ + const char *name; /**< Identification String */ + int abi_version; /**< Implemented ABI version */ + vpx_codec_caps_t caps; /**< Decoder capabilities */ + vpx_codec_init_fn_t init; /**< \copydoc ::vpx_codec_init_fn_t */ + vpx_codec_destroy_fn_t destroy; /**< \copydoc ::vpx_codec_destroy_fn_t */ + vpx_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::vpx_codec_ctrl_fn_map_t */ struct vpx_codec_dec_iface { - vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ - vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */ - vpx_codec_decode_fn_t decode; /**< \copydoc ::vpx_codec_decode_fn_t */ - vpx_codec_get_frame_fn_t get_frame; /**< \copydoc ::vpx_codec_get_frame_fn_t */ - vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */ + vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ + vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */ + vpx_codec_decode_fn_t decode; /**< \copydoc ::vpx_codec_decode_fn_t */ + vpx_codec_get_frame_fn_t + get_frame; /**< \copydoc ::vpx_codec_get_frame_fn_t */ + vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */ } dec; struct vpx_codec_enc_iface { - int cfg_map_count; - vpx_codec_enc_cfg_map_t *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */ - vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ - vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ - vpx_codec_enc_config_set_fn_t cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ - vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */ - vpx_codec_get_preview_frame_fn_t get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */ - vpx_codec_enc_mr_get_mem_loc_fn_t mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */ + int cfg_map_count; + vpx_codec_enc_cfg_map_t + *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */ + vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ + vpx_codec_get_cx_data_fn_t + get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ + vpx_codec_enc_config_set_fn_t + cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ + vpx_codec_get_global_headers_fn_t + get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */ + vpx_codec_get_preview_frame_fn_t + get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */ + vpx_codec_enc_mr_get_mem_loc_fn_t + mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */ } enc; }; /*!\brief Callback function pointer / user data pair storage */ typedef struct vpx_codec_priv_cb_pair { union { - vpx_codec_put_frame_cb_fn_t put_frame; - vpx_codec_put_slice_cb_fn_t put_slice; + vpx_codec_put_frame_cb_fn_t put_frame; + vpx_codec_put_slice_cb_fn_t put_slice; } u; - void *user_priv; + void *user_priv; } vpx_codec_priv_cb_pair_t; - /*!\brief Instance private storage * * This structure is allocated by the algorithm's init function. It can be @@ -335,39 +337,38 @@ typedef struct vpx_codec_priv_cb_pair { * and the pointer cast to the proper type. 
*/ struct vpx_codec_priv { - const char *err_detail; - vpx_codec_flags_t init_flags; + const char *err_detail; + vpx_codec_flags_t init_flags; struct { - vpx_codec_priv_cb_pair_t put_frame_cb; - vpx_codec_priv_cb_pair_t put_slice_cb; + vpx_codec_priv_cb_pair_t put_frame_cb; + vpx_codec_priv_cb_pair_t put_slice_cb; } dec; struct { - vpx_fixed_buf_t cx_data_dst_buf; - unsigned int cx_data_pad_before; - unsigned int cx_data_pad_after; - vpx_codec_cx_pkt_t cx_data_pkt; - unsigned int total_encoders; + vpx_fixed_buf_t cx_data_dst_buf; + unsigned int cx_data_pad_before; + unsigned int cx_data_pad_after; + vpx_codec_cx_pkt_t cx_data_pkt; + unsigned int total_encoders; } enc; }; /* * Multi-resolution encoding internal configuration */ -struct vpx_codec_priv_enc_mr_cfg -{ - unsigned int mr_total_resolutions; - unsigned int mr_encoder_id; - struct vpx_rational mr_down_sampling_factor; - void* mr_low_res_mode_info; +struct vpx_codec_priv_enc_mr_cfg { + unsigned int mr_total_resolutions; + unsigned int mr_encoder_id; + struct vpx_rational mr_down_sampling_factor; + void *mr_low_res_mode_info; }; #undef VPX_CTRL_USE_TYPE #define VPX_CTRL_USE_TYPE(id, typ) \ - static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} + static VPX_INLINE typ id##__value(va_list args) { return va_arg(args, typ); } #undef VPX_CTRL_USE_TYPE_DEPRECATED #define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} + static VPX_INLINE typ id##__value(va_list args) { return va_arg(args, typ); } #define CAST(id, arg) id##__value(arg) @@ -380,10 +381,9 @@ struct vpx_codec_priv_enc_mr_cfg * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE * macro is provided to define this getter function automatically. */ -#define CODEC_INTERFACE(id)\ - vpx_codec_iface_t* id(void) { return &id##_algo; }\ - vpx_codec_iface_t id##_algo - +#define CODEC_INTERFACE(id) \ + vpx_codec_iface_t *id(void) { return &id##_algo; } \ + vpx_codec_iface_t id##_algo /* Internal Utility Functions * @@ -391,38 +391,39 @@ struct vpx_codec_priv_enc_mr_cfg * utilities for manipulating vpx_codec_* data structures. 
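The CODEC_INTERFACE() macro above defines both the _algo object and its getter in one place; a schematic use for a hypothetical codec follows (the example_* functions are placeholders, VPX_CODEC_CAP_DECODER comes from the public header, and the member order follows the vpx_codec_iface struct in the previous hunk):

CODEC_INTERFACE(vpx_codec_example_dx) = {
  "Example Decoder",              /* name */
  VPX_CODEC_INTERNAL_ABI_VERSION, /* abi_version */
  VPX_CODEC_CAP_DECODER,          /* caps */
  example_init,                   /* init */
  example_destroy,                /* destroy */
  example_ctrl_maps,              /* ctrl_maps */
  { example_peek_si, example_get_si, example_decode, example_get_frame,
    NULL /* set_fb_fn */ },       /* dec */
  { 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL } /* enc (decoder only) */
};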
*/ struct vpx_codec_pkt_list { - unsigned int cnt; - unsigned int max; + unsigned int cnt; + unsigned int max; struct vpx_codec_cx_pkt pkts[1]; }; -#define vpx_codec_pkt_list_decl(n)\ - union {struct vpx_codec_pkt_list head;\ - struct {struct vpx_codec_pkt_list head;\ - struct vpx_codec_cx_pkt pkts[n];} alloc;} - -#define vpx_codec_pkt_list_init(m)\ - (m)->alloc.head.cnt = 0,\ - (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0]) +#define vpx_codec_pkt_list_decl(n) \ + union { \ + struct vpx_codec_pkt_list head; \ + struct { \ + struct vpx_codec_pkt_list head; \ + struct vpx_codec_cx_pkt pkts[n]; \ + } alloc; \ + } -int -vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *, - const struct vpx_codec_cx_pkt *); +#define vpx_codec_pkt_list_init(m) \ + (m)->alloc.head.cnt = 0, \ + (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0]) -const vpx_codec_cx_pkt_t * -vpx_codec_pkt_list_get(struct vpx_codec_pkt_list *list, - vpx_codec_iter_t *iter); +int vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *, + const struct vpx_codec_cx_pkt *); +const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get( + struct vpx_codec_pkt_list *list, vpx_codec_iter_t *iter); #include <stdio.h> #include <setjmp.h> struct vpx_internal_error_info { - vpx_codec_err_t error_code; - int has_detail; - char detail[80]; - int setjmp; - jmp_buf jmp; + vpx_codec_err_t error_code; + int has_detail; + char detail[80]; + int setjmp; + jmp_buf jmp; }; #define CLANG_ANALYZER_NORETURN @@ -434,8 +435,7 @@ struct vpx_internal_error_info { #endif void vpx_internal_error(struct vpx_internal_error_info *info, - vpx_codec_err_t error, - const char *fmt, + vpx_codec_err_t error, const char *fmt, ...) CLANG_ANALYZER_NORETURN; #ifdef __cplusplus diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c index 5a495ce814b814fe3fef105fb76894e754fda420..f222b9e5cb6f5073f551fec9f332bca7d01a2793 100644 --- a/vpx/src/vpx_codec.c +++ b/vpx/src/vpx_codec.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\file * \brief Provides the high level interface to wrap decoder algorithms. * @@ -19,67 +18,50 @@ #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" -#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var) - -int vpx_codec_version(void) { - return VERSION_PACKED; -} - - -const char *vpx_codec_version_str(void) { - return VERSION_STRING_NOSP; -} +#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) +int vpx_codec_version(void) { return VERSION_PACKED; } -const char *vpx_codec_version_extra_str(void) { - return VERSION_EXTRA; -} +const char *vpx_codec_version_str(void) { return VERSION_STRING_NOSP; } +const char *vpx_codec_version_extra_str(void) { return VERSION_EXTRA; } const char *vpx_codec_iface_name(vpx_codec_iface_t *iface) { return iface ? 
iface->name : "<invalid interface>"; } -const char *vpx_codec_err_to_string(vpx_codec_err_t err) { +const char *vpx_codec_err_to_string(vpx_codec_err_t err) { switch (err) { - case VPX_CODEC_OK: - return "Success"; - case VPX_CODEC_ERROR: - return "Unspecified internal error"; - case VPX_CODEC_MEM_ERROR: - return "Memory allocation error"; - case VPX_CODEC_ABI_MISMATCH: - return "ABI version mismatch"; + case VPX_CODEC_OK: return "Success"; + case VPX_CODEC_ERROR: return "Unspecified internal error"; + case VPX_CODEC_MEM_ERROR: return "Memory allocation error"; + case VPX_CODEC_ABI_MISMATCH: return "ABI version mismatch"; case VPX_CODEC_INCAPABLE: return "Codec does not implement requested capability"; case VPX_CODEC_UNSUP_BITSTREAM: return "Bitstream not supported by this decoder"; case VPX_CODEC_UNSUP_FEATURE: return "Bitstream required feature not supported by this decoder"; - case VPX_CODEC_CORRUPT_FRAME: - return "Corrupt frame detected"; - case VPX_CODEC_INVALID_PARAM: - return "Invalid parameter"; - case VPX_CODEC_LIST_END: - return "End of iterated list"; + case VPX_CODEC_CORRUPT_FRAME: return "Corrupt frame detected"; + case VPX_CODEC_INVALID_PARAM: return "Invalid parameter"; + case VPX_CODEC_LIST_END: return "End of iterated list"; } return "Unrecognized error code"; } -const char *vpx_codec_error(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error(vpx_codec_ctx_t *ctx) { return (ctx) ? vpx_codec_err_to_string(ctx->err) - : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM); + : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM); } -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) { if (ctx && ctx->err) return ctx->priv ? ctx->priv->err_detail : ctx->err_detail; return NULL; } - vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { vpx_codec_err_t res; @@ -99,15 +81,11 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { return SAVE_STATUS(ctx, res); } - vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) { return (iface) ? iface->caps : 0; } - -vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, - int ctrl_id, - ...) { +vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) { vpx_codec_err_t res; if (!ctx || !ctrl_id) @@ -121,7 +99,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) { if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) { - va_list ap; + va_list ap; va_start(ap, ctrl_id); res = entry->fn((vpx_codec_alg_priv_t *)ctx->priv, ap); @@ -135,16 +113,14 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, } void vpx_internal_error(struct vpx_internal_error_info *info, - vpx_codec_err_t error, - const char *fmt, - ...) { + vpx_codec_err_t error, const char *fmt, ...) { va_list ap; info->error_code = error; info->has_detail = 0; if (fmt) { - size_t sz = sizeof(info->detail); + size_t sz = sizeof(info->detail); info->has_detail = 1; va_start(ap, fmt); @@ -153,6 +129,5 @@ void vpx_internal_error(struct vpx_internal_error_info *info, info->detail[sz - 1] = '\0'; } - if (info->setjmp) - longjmp(info->jmp, info->error_code); + if (info->setjmp) longjmp(info->jmp, info->error_code); } diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c index 802d8edd8a437a3e424941adb3273485ff00b23d..fc1c2bccae77f5067c26bd44af66e316d8908fbd 100644 --- a/vpx/src/vpx_decoder.c +++ b/vpx/src/vpx_decoder.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. 
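For reference, the three error accessors reformatted here are exactly what die_codec() in tools_common.c builds on; a non-fatal variant might look like this sketch:

#include <stdio.h>
#include "vpx/vpx_codec.h"

/* Report a failed call on ctx without exiting, mirroring die_codec(). */
static void report_codec_error(vpx_codec_ctx_t *ctx, const char *what) {
  const char *detail = vpx_codec_error_detail(ctx);
  fprintf(stderr, "%s: %s\n", what, vpx_codec_error(ctx));
  if (detail) fprintf(stderr, "  %s\n", detail);
}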
*/ - /*!\file * \brief Provides the high level interface to wrap decoder algorithms. * @@ -16,17 +15,16 @@ #include <string.h> #include "vpx/internal/vpx_codec_internal.h" -#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var) +#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { return (vpx_codec_alg_priv_t *)ctx->priv; } -vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, +vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, const vpx_codec_dec_cfg_t *cfg, - vpx_codec_flags_t flags, - int ver) { + vpx_codec_flags_t flags, int ver) { vpx_codec_err_t res; if (ver != VPX_DECODER_ABI_VERSION) @@ -35,7 +33,8 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_INVALID_PARAM; else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) res = VPX_CODEC_ABI_MISMATCH; - else if ((flags & VPX_CODEC_USE_POSTPROC) && !(iface->caps & VPX_CODEC_CAP_POSTPROC)) + else if ((flags & VPX_CODEC_USE_POSTPROC) && + !(iface->caps & VPX_CODEC_CAP_POSTPROC)) res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_ERROR_CONCEALMENT) && !(iface->caps & VPX_CODEC_CAP_ERROR_CONCEALMENT)) @@ -63,15 +62,14 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } - -vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, - const uint8_t *data, - unsigned int data_sz, +vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, + const uint8_t *data, + unsigned int data_sz, vpx_codec_stream_info_t *si) { vpx_codec_err_t res; - if (!iface || !data || !data_sz || !si - || si->sz < sizeof(vpx_codec_stream_info_t)) + if (!iface || !data || !data_sz || !si || + si->sz < sizeof(vpx_codec_stream_info_t)) res = VPX_CODEC_INVALID_PARAM; else { /* Set default/unknown values */ @@ -84,8 +82,7 @@ vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, return res; } - -vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, +vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, vpx_codec_stream_info_t *si) { vpx_codec_err_t res; @@ -104,12 +101,9 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } - -vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, - const uint8_t *data, - unsigned int data_sz, - void *user_priv, - long deadline) { +vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, + unsigned int data_sz, void *user_priv, + long deadline) { vpx_codec_err_t res; /* Sanity checks */ @@ -126,8 +120,7 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } -vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, - vpx_codec_iter_t *iter) { +vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img; if (!ctx || !iter || !ctx->iface || !ctx->priv) @@ -138,16 +131,15 @@ vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, return img; } - -vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, - vpx_codec_put_frame_cb_fn_t cb, - void *user_priv) { +vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_frame_cb_fn_t cb, + void *user_priv) { vpx_codec_err_t res; if (!ctx || !cb) res = VPX_CODEC_INVALID_PARAM; - else if (!ctx->iface || !ctx->priv - || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME)) + else if (!ctx->iface || !ctx->priv || + !(ctx->iface->caps & 
VPX_CODEC_CAP_PUT_FRAME)) res = VPX_CODEC_ERROR; else { ctx->priv->dec.put_frame_cb.u.put_frame = cb; @@ -158,16 +150,15 @@ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx return SAVE_STATUS(ctx, res); } - -vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, - vpx_codec_put_slice_cb_fn_t cb, - void *user_priv) { +vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_slice_cb_fn_t cb, + void *user_priv) { vpx_codec_err_t res; if (!ctx || !cb) res = VPX_CODEC_INVALID_PARAM; - else if (!ctx->iface || !ctx->priv - || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE)) + else if (!ctx->iface || !ctx->priv || + !(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE)) res = VPX_CODEC_ERROR; else { ctx->priv->dec.put_slice_cb.u.put_slice = cb; diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index cd10c411ceaf5ec0e33e58da7bbbf61c4cc2f08e..4390cf7c8f1f5ecf8dd356b97a722a384f071113 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\file * \brief Provides the high level interface to wrap encoder algorithms. * @@ -18,17 +17,16 @@ #include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" -#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var) +#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { return (vpx_codec_alg_priv_t *)ctx->priv; } -vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, +vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, const vpx_codec_enc_cfg_t *cfg, - vpx_codec_flags_t flags, - int ver) { + vpx_codec_flags_t flags, int ver) { vpx_codec_err_t res; if (ver != VPX_ENCODER_ABI_VERSION) @@ -39,11 +37,10 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_ABI_MISMATCH; else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_PSNR) - && !(iface->caps & VPX_CODEC_CAP_PSNR)) + else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) - && !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) + else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) && + !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) res = VPX_CODEC_INCAPABLE; else { ctx->iface = iface; @@ -62,13 +59,9 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } -vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, - int num_enc, - vpx_codec_flags_t flags, - vpx_rational_t *dsf, - int ver) { +vpx_codec_err_t vpx_codec_enc_init_multi_ver( + vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, + int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver) { vpx_codec_err_t res = VPX_CODEC_OK; if (ver != VPX_ENCODER_ABI_VERSION) @@ -79,11 +72,10 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_ABI_MISMATCH; else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_PSNR) - && !(iface->caps & VPX_CODEC_CAP_PSNR)) + else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) - && !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) 
+ else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) && + !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) res = VPX_CODEC_INCAPABLE; else { int i; @@ -110,8 +102,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, * resolution always use the same frame_type chosen by the * lowest-resolution encoder. */ - if (mr_cfg.mr_encoder_id) - cfg->kf_mode = VPX_KF_DISABLED; + if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; ctx->iface = iface; ctx->name = iface->name; @@ -121,8 +112,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, res = ctx->iface->init(ctx, &mr_cfg); if (res) { - const char *error_detail = - ctx->priv ? ctx->priv->err_detail : NULL; + const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL; /* Destroy current ctx */ ctx->err_detail = error_detail; vpx_codec_destroy(ctx); @@ -136,8 +126,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, } } - if (res) - break; + if (res) break; ctx++; cfg++; @@ -150,10 +139,9 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } - -vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, - unsigned int usage) { +vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg, + unsigned int usage) { vpx_codec_err_t res; vpx_codec_enc_cfg_map_t *map; int i; @@ -179,30 +167,28 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, return res; } - #if ARCH_X86 || ARCH_X86_64 /* On X86, disable the x87 unit's internal 80 bit precision for better * consistency with the SSE unit's 64 bit precision. */ #include "vpx_ports/x86.h" -#define FLOATING_POINT_INIT() do {\ +#define FLOATING_POINT_INIT() \ + do { \ unsigned short x87_orig_mode = x87_set_double_precision(); -#define FLOATING_POINT_RESTORE() \ - x87_set_control_word(x87_orig_mode); }while(0) - +#define FLOATING_POINT_RESTORE() \ + x87_set_control_word(x87_orig_mode); \ + } \ + while (0) #else static void FLOATING_POINT_INIT() {} static void FLOATING_POINT_RESTORE() {} #endif - -vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned long duration, - vpx_enc_frame_flags_t flags, - unsigned long deadline) { +vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned long duration, + vpx_enc_frame_flags_t flags, + unsigned long deadline) { vpx_codec_err_t res = VPX_CODEC_OK; if (!ctx || (img && !duration)) @@ -220,8 +206,8 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, FLOATING_POINT_INIT(); if (num_enc == 1) - res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, - duration, flags, deadline); + res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags, + deadline); else { /* Multi-resolution encoding: * Encode multi-levels in reverse order. 
For example, @@ -234,8 +220,8 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, if (img) img += num_enc - 1; for (i = num_enc - 1; i >= 0; i--) { - if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, - duration, flags, deadline))) + if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, + flags, deadline))) break; ctx--; @@ -250,7 +236,6 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } - const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter) { const vpx_codec_cx_pkt_t *pkt = NULL; @@ -273,18 +258,18 @@ const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, vpx_codec_priv_t *const priv = ctx->priv; char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf; - if (dst_buf && - pkt->data.raw.buf != dst_buf && + if (dst_buf && pkt->data.raw.buf != dst_buf && pkt->data.raw.sz + priv->enc.cx_data_pad_before + - priv->enc.cx_data_pad_after <= priv->enc.cx_data_dst_buf.sz) { + priv->enc.cx_data_pad_after <= + priv->enc.cx_data_dst_buf.sz) { vpx_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt; memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf, pkt->data.raw.sz); *modified_pkt = *pkt; modified_pkt->data.raw.buf = dst_buf; - modified_pkt->data.raw.sz += priv->enc.cx_data_pad_before + - priv->enc.cx_data_pad_after; + modified_pkt->data.raw.sz += + priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after; pkt = modified_pkt; } @@ -297,13 +282,11 @@ const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, return pkt; } - -vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, +vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, const vpx_fixed_buf_t *buf, - unsigned int pad_before, - unsigned int pad_after) { - if (!ctx || !ctx->priv) - return VPX_CODEC_INVALID_PARAM; + unsigned int pad_before, + unsigned int pad_after) { + if (!ctx || !ctx->priv) return VPX_CODEC_INVALID_PARAM; if (buf) { ctx->priv->enc.cx_data_dst_buf = *buf; @@ -319,8 +302,7 @@ vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, return VPX_CODEC_OK; } - -const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx) { +const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx) { vpx_image_t *img = NULL; if (ctx) { @@ -337,8 +319,7 @@ const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx) { return img; } - -vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx) { +vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx) { vpx_fixed_buf_t *buf = NULL; if (ctx) { @@ -355,9 +336,8 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx) { return buf; } - -vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, - const vpx_codec_enc_cfg_t *cfg) { +vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, + const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; if (!ctx || !ctx->iface || !ctx->priv || !cfg) @@ -370,7 +350,6 @@ vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } - int vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *list, const struct vpx_codec_cx_pkt *pkt) { if (list->cnt < list->max) { @@ -381,9 +360,8 @@ int vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *list, return 1; } - -const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get(struct vpx_codec_pkt_list *list, - vpx_codec_iter_t *iter) { +const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get( + struct vpx_codec_pkt_list *list, vpx_codec_iter_t *iter) { const 
vpx_codec_cx_pkt_t *pkt; if (!(*iter)) { diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c index 9aae12c794ba225ed78a74eb6c36108c431910ef..dba439c10a8490f82e1a7dc03f2f90f7f66309ca 100644 --- a/vpx/src/vpx_image.c +++ b/vpx/src/vpx_image.c @@ -15,10 +15,8 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" -static vpx_image_t *img_alloc_helper(vpx_image_t *img, - vpx_img_fmt_t fmt, - unsigned int d_w, - unsigned int d_h, +static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, unsigned int buf_align, unsigned int stride_align, unsigned char *img_data) { @@ -27,68 +25,44 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, int align; /* Treat align==0 like align==1 */ - if (!buf_align) - buf_align = 1; + if (!buf_align) buf_align = 1; /* Validate alignment (must be power of 2) */ - if (buf_align & (buf_align - 1)) - goto fail; + if (buf_align & (buf_align - 1)) goto fail; /* Treat align==0 like align==1 */ - if (!stride_align) - stride_align = 1; + if (!stride_align) stride_align = 1; /* Validate alignment (must be power of 2) */ - if (stride_align & (stride_align - 1)) - goto fail; + if (stride_align & (stride_align - 1)) goto fail; /* Get sample size for this format */ switch (fmt) { case VPX_IMG_FMT_RGB32: case VPX_IMG_FMT_RGB32_LE: case VPX_IMG_FMT_ARGB: - case VPX_IMG_FMT_ARGB_LE: - bps = 32; - break; + case VPX_IMG_FMT_ARGB_LE: bps = 32; break; case VPX_IMG_FMT_RGB24: - case VPX_IMG_FMT_BGR24: - bps = 24; - break; + case VPX_IMG_FMT_BGR24: bps = 24; break; case VPX_IMG_FMT_RGB565: case VPX_IMG_FMT_RGB565_LE: case VPX_IMG_FMT_RGB555: case VPX_IMG_FMT_RGB555_LE: case VPX_IMG_FMT_UYVY: case VPX_IMG_FMT_YUY2: - case VPX_IMG_FMT_YVYU: - bps = 16; - break; + case VPX_IMG_FMT_YVYU: bps = 16; break; case VPX_IMG_FMT_I420: case VPX_IMG_FMT_YV12: case VPX_IMG_FMT_VPXI420: - case VPX_IMG_FMT_VPXYV12: - bps = 12; - break; + case VPX_IMG_FMT_VPXYV12: bps = 12; break; case VPX_IMG_FMT_I422: - case VPX_IMG_FMT_I440: - bps = 16; - break; - case VPX_IMG_FMT_I444: - bps = 24; - break; - case VPX_IMG_FMT_I42016: - bps = 24; - break; + case VPX_IMG_FMT_I440: bps = 16; break; + case VPX_IMG_FMT_I444: bps = 24; break; + case VPX_IMG_FMT_I42016: bps = 24; break; case VPX_IMG_FMT_I42216: - case VPX_IMG_FMT_I44016: - bps = 32; - break; - case VPX_IMG_FMT_I44416: - bps = 48; - break; - default: - bps = 16; - break; + case VPX_IMG_FMT_I44016: bps = 32; break; + case VPX_IMG_FMT_I44416: bps = 48; break; + default: bps = 16; break; } /* Get chroma shift values for this format */ @@ -99,12 +73,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, case VPX_IMG_FMT_VPXYV12: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I42016: - case VPX_IMG_FMT_I42216: - xcs = 1; - break; - default: - xcs = 0; - break; + case VPX_IMG_FMT_I42216: xcs = 1; break; + default: xcs = 0; break; } switch (fmt) { @@ -114,12 +84,8 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, case VPX_IMG_FMT_VPXI420: case VPX_IMG_FMT_VPXYV12: case VPX_IMG_FMT_I42016: - case VPX_IMG_FMT_I44016: - ycs = 1; - break; - default: - ycs = 0; - break; + case VPX_IMG_FMT_I44016: ycs = 1; break; + default: ycs = 0; break; } /* Calculate storage sizes given the chroma subsampling */ @@ -135,8 +101,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, if (!img) { img = (vpx_image_t *)calloc(1, sizeof(vpx_image_t)); - if (!img) - goto fail; + if (!img) goto fail; img->self_allocd = 1; } else { @@ -146,18 +111,17 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, 
img->img_data = img_data; if (!img_data) { - const uint64_t alloc_size = (fmt & VPX_IMG_FMT_PLANAR) ? - (uint64_t)h * s * bps / 8 : (uint64_t)h * s; + const uint64_t alloc_size = (fmt & VPX_IMG_FMT_PLANAR) + ? (uint64_t)h * s * bps / 8 + : (uint64_t)h * s; - if (alloc_size != (size_t)alloc_size) - goto fail; + if (alloc_size != (size_t)alloc_size) goto fail; img->img_data = (uint8_t *)vpx_memalign(buf_align, (size_t)alloc_size); img->img_data_owner = 1; } - if (!img->img_data) - goto fail; + if (!img->img_data) goto fail; img->fmt = fmt; img->bit_depth = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; @@ -172,39 +136,30 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, img->stride[VPX_PLANE_U] = img->stride[VPX_PLANE_V] = stride_in_bytes >> xcs; /* Default viewport to entire image */ - if (!vpx_img_set_rect(img, 0, 0, d_w, d_h)) - return img; + if (!vpx_img_set_rect(img, 0, 0, d_w, d_h)) return img; fail: vpx_img_free(img); return NULL; } -vpx_image_t *vpx_img_alloc(vpx_image_t *img, - vpx_img_fmt_t fmt, - unsigned int d_w, - unsigned int d_h, - unsigned int align) { +vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align) { return img_alloc_helper(img, fmt, d_w, d_h, align, align, NULL); } -vpx_image_t *vpx_img_wrap(vpx_image_t *img, - vpx_img_fmt_t fmt, - unsigned int d_w, - unsigned int d_h, - unsigned int stride_align, - unsigned char *img_data) { +vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int stride_align, + unsigned char *img_data) { /* By setting buf_align = 1, we don't change buffer alignment in this * function. */ return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data); } -int vpx_img_set_rect(vpx_image_t *img, - unsigned int x, - unsigned int y, - unsigned int w, - unsigned int h) { - unsigned char *data; +int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h) { + unsigned char *data; if (x + w <= img->w && y + h <= img->h) { img->d_w = w; @@ -213,7 +168,7 @@ int vpx_img_set_rect(vpx_image_t *img, /* Calculate plane pointers */ if (!(img->fmt & VPX_IMG_FMT_PLANAR)) { img->planes[VPX_PLANE_PACKED] = - img->img_data + x * img->bps / 8 + y * img->stride[VPX_PLANE_PACKED]; + img->img_data + x * img->bps / 8 + y * img->stride[VPX_PLANE_PACKED]; } else { const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; @@ -225,8 +180,8 @@ int vpx_img_set_rect(vpx_image_t *img, data += img->h * img->stride[VPX_PLANE_ALPHA]; } - img->planes[VPX_PLANE_Y] = data + x * bytes_per_sample + - y * img->stride[VPX_PLANE_Y]; + img->planes[VPX_PLANE_Y] = + data + x * bytes_per_sample + y * img->stride[VPX_PLANE_Y]; data += img->h * img->stride[VPX_PLANE_Y]; if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) { @@ -262,24 +217,23 @@ void vpx_img_flip(vpx_image_t *img) { img->planes[VPX_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[VPX_PLANE_Y]; img->stride[VPX_PLANE_Y] = -img->stride[VPX_PLANE_Y]; - img->planes[VPX_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) - * img->stride[VPX_PLANE_U]; + img->planes[VPX_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[VPX_PLANE_U]; img->stride[VPX_PLANE_U] = -img->stride[VPX_PLANE_U]; - img->planes[VPX_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) - * img->stride[VPX_PLANE_V]; + img->planes[VPX_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[VPX_PLANE_V]; img->stride[VPX_PLANE_V] = -img->stride[VPX_PLANE_V]; - img->planes[VPX_PLANE_ALPHA] += (signed)(img->d_h - 1) * img->stride[VPX_PLANE_ALPHA]; + img->planes[VPX_PLANE_ALPHA] += + (signed)(img->d_h - 1) * img->stride[VPX_PLANE_ALPHA]; img->stride[VPX_PLANE_ALPHA] = -img->stride[VPX_PLANE_ALPHA]; } void vpx_img_free(vpx_image_t *img) { if (img) { - if (img->img_data && img->img_data_owner) - vpx_free(img->img_data); + if (img->img_data && img->img_data_owner) vpx_free(img->img_data); - if (img->self_allocd) - free(img); + if (img->self_allocd) free(img); } } diff --git a/vpx/vp8.h b/vpx/vp8.h index ba67c38366516daff3f91e2660c2cbc6e6bb752b..e27b705a9b76f8e4c59098224babbc7db0fb0568 100644 --- a/vpx/vp8.h +++ b/vpx/vp8.h @@ -42,24 +42,27 @@ extern "C" { * The set of macros define the control functions of VP8 interface */ enum vp8_com_control_id { - VP8_SET_REFERENCE = 1, /**< pass in an external frame into decoder to be used as reference frame */ - VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ - VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */ - VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< set the reference frames to color for each macroblock */ - VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */ - VP8_SET_DBG_COLOR_B_MODES = 6, /**< set which blocks modes to color */ - VP8_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */ + /*!\brief pass in an external frame into decoder to be used as reference frame + */ + VP8_SET_REFERENCE = 1, + VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ + VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */ + VP8_SET_DBG_COLOR_REF_FRAME = + 4, /**< set the reference frames to color for each macroblock */ + VP8_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */ + VP8_SET_DBG_COLOR_B_MODES = 6, /**< set which blocks modes to color */ + VP8_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */ /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+) * for its control ids. These should be migrated to something like the * VP8_DECODER_CTRL_ID_START range next time we're ready to break the ABI. 
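A short lifecycle sketch for the image helpers above (note that vpx_img_set_rect() returns 0 on success, which is how img_alloc_helper() itself calls it):

/* Allocate a 640x480 I420 image with 32-byte row alignment. */
vpx_image_t *img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, 640, 480, 32);
if (!img) fatal("Failed to allocate image");

/* Shrink the viewport to the top 640x360 region; nonzero means failure. */
if (vpx_img_set_rect(img, 0, 0, 640, 360))
  fatal("Failed to set viewport");

/* ... fill pixel data through img->planes[] and img->stride[] ... */

vpx_img_free(img);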
*/ - VP9_GET_REFERENCE = 128, /**< get a pointer to a reference frame */ + VP9_GET_REFERENCE = 128, /**< get a pointer to a reference frame */ VP8_COMMON_CTRL_ID_MAX, - VP10_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */ + VP10_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */ - VP8_DECODER_CTRL_ID_START = 256 + VP8_DECODER_CTRL_ID_START = 256 }; /*!\brief post process flags @@ -67,15 +70,16 @@ enum vp8_com_control_id { * The set of macros define VP8 decoder post processing flags */ enum vp8_postproc_level { - VP8_NOFILTERING = 0, - VP8_DEBLOCK = 1 << 0, - VP8_DEMACROBLOCK = 1 << 1, - VP8_ADDNOISE = 1 << 2, - VP8_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */ - VP8_DEBUG_TXT_MBLK_MODES = 1 << 4, /**< print macro block modes over each macro block */ - VP8_DEBUG_TXT_DC_DIFF = 1 << 5, /**< print dc diff for each macro block */ - VP8_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */ - VP8_MFQE = 1 << 10 + VP8_NOFILTERING = 0, + VP8_DEBLOCK = 1 << 0, + VP8_DEMACROBLOCK = 1 << 1, + VP8_ADDNOISE = 1 << 2, + VP8_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */ + VP8_DEBUG_TXT_MBLK_MODES = + 1 << 4, /**< print macro block modes over each macro block */ + VP8_DEBUG_TXT_DC_DIFF = 1 << 5, /**< print dc diff for each macro block */ + VP8_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */ + VP8_MFQE = 1 << 10 }; /*!\brief post process flags @@ -86,9 +90,11 @@ enum vp8_postproc_level { */ typedef struct vp8_postproc_cfg { - int post_proc_flag; /**< the types of post processing to be done, should be combination of "vp8_postproc_level" */ - int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */ - int noise_level; /**< the strength of additive noise, valid range [0, 16] */ + /*!\brief the types of post processing to be done, should be combination of + * "vp8_postproc_level" */ + int post_proc_flag; + int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */ + int noise_level; /**< the strength of additive noise, valid range [0, 16] */ } vp8_postproc_cfg_t; /*!\brief reference frame type @@ -106,8 +112,8 @@ typedef enum vpx_ref_frame_type { * Define the data struct to access vp8 reference frames. */ typedef struct vpx_ref_frame { - vpx_ref_frame_type_t frame_type; /**< which reference frame */ - vpx_image_t img; /**< reference frame data in image format */ + vpx_ref_frame_type_t frame_type; /**< which reference frame */ + vpx_image_t img; /**< reference frame data in image format */ } vpx_ref_frame_t; /*!\brief VP9 specific reference frame data struct @@ -115,8 +121,8 @@ typedef struct vpx_ref_frame { * Define the data struct to access vp9 reference frames. 
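To ground the post-processing knobs documented here, a decoder-side sketch (vpx_codec_control() is the public wrapper macro, not touched by this diff, and the decoder is assumed to have been initialized with VPX_CODEC_USE_POSTPROC as checked in vpx_decoder.c above):

/* Enable deblocking plus macroblock-level smoothing at moderate strength. */
vp8_postproc_cfg_t pp;
pp.post_proc_flag = VP8_DEBLOCK | VP8_DEMACROBLOCK;
pp.deblocking_level = 4; /* valid range [0, 16] per the comment above */
pp.noise_level = 0;

if (vpx_codec_control(&decoder, VP8_SET_POSTPROC, &pp))
  die_codec(&decoder, "Failed to set post-processing options");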
*/ typedef struct vp9_ref_frame { - int idx; /**< frame index to get (input) */ - vpx_image_t img; /**< img structure to populate (output) */ + int idx; /**< frame index to get (input) */ + vpx_image_t img; /**< img structure to populate (output) */ } vp9_ref_frame_t; /*!\cond */ @@ -124,23 +130,23 @@ typedef struct vp9_ref_frame { * * defines the data type for each of VP8 decoder control function requires */ -VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE, vpx_ref_frame_t *) +VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE, vpx_ref_frame_t *) #define VPX_CTRL_VP8_SET_REFERENCE -VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *) +VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *) #define VPX_CTRL_VP8_COPY_REFERENCE -VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *) +VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *) #define VPX_CTRL_VP8_SET_POSTPROC VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int) #define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME -VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int) +VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES, int) #define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES -VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int) +VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES, int) #define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES -VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int) +VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int) #define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV -VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) +VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) #define VPX_CTRL_VP9_GET_REFERENCE -VPX_CTRL_USE_TYPE(VP10_GET_NEW_FRAME_IMAGE, vpx_image_t *) +VPX_CTRL_USE_TYPE(VP10_GET_NEW_FRAME_IMAGE, vpx_image_t *) #define VPX_CTRL_VP10_GET_NEW_FRAME_IMAGE /*!\endcond */ diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 954ad3295351a1f3654f0904b24dac52acc19e7d..3b410580182f7bceadf6abb8d38d24f4223c60cc 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -32,7 +32,7 @@ extern "C" { * This interface provides the capability to encode raw VP10 streams. * @{ */ -extern vpx_codec_iface_t vpx_codec_vp10_cx_algo; +extern vpx_codec_iface_t vpx_codec_vp10_cx_algo; extern vpx_codec_iface_t *vpx_codec_vp10_cx(void); /*!@} - end algorithm interface member group*/ @@ -46,8 +46,7 @@ extern vpx_codec_iface_t *vpx_codec_vp10_cx(void); * predictor. When not set, the encoder will choose whether to use the * last frame or not automatically. */ -#define VP8_EFLAG_NO_REF_LAST (1<<16) - +#define VP8_EFLAG_NO_REF_LAST (1 << 16) /*!\brief Don't reference the golden frame * @@ -55,8 +54,7 @@ extern vpx_codec_iface_t *vpx_codec_vp10_cx(void); * predictor. When not set, the encoder will choose whether to use the * golden frame or not automatically. */ -#define VP8_EFLAG_NO_REF_GF (1<<17) - +#define VP8_EFLAG_NO_REF_GF (1 << 17) /*!\brief Don't reference the alternate reference frame * @@ -64,56 +62,49 @@ extern vpx_codec_iface_t *vpx_codec_vp10_cx(void); * predictor. When not set, the encoder will choose whether to use the * alt ref frame or not automatically. */ -#define VP8_EFLAG_NO_REF_ARF (1<<21) - +#define VP8_EFLAG_NO_REF_ARF (1 << 21) /*!\brief Don't update the last frame * * When this flag is set, the encoder will not update the last frame with * the contents of the current frame. */ -#define VP8_EFLAG_NO_UPD_LAST (1<<18) - +#define VP8_EFLAG_NO_UPD_LAST (1 << 18) /*!\brief Don't update the golden frame * * When this flag is set, the encoder will not update the golden frame with * the contents of the current frame. 
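/* Usage sketch for vp9_ref_frame_t and VP9_GET_REFERENCE: illustrative only,
 * not part of this header.  It assumes `decoder` is an initialized decoder
 * context that supports this control and has decoded at least one frame. */
#include "vpx/vp8.h"

static int get_reference(vpx_codec_ctx_t *decoder, int frame_idx,
                         vp9_ref_frame_t *ref) {
  ref->idx = frame_idx; /* frame index to get (input) */
  /* On success, ref->img is populated with the reference frame (output). */
  return vpx_codec_control(decoder, VP9_GET_REFERENCE, ref) == VPX_CODEC_OK;
}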
*/ -#define VP8_EFLAG_NO_UPD_GF (1<<22) - +#define VP8_EFLAG_NO_UPD_GF (1 << 22) /*!\brief Don't update the alternate reference frame * * When this flag is set, the encoder will not update the alt ref frame with * the contents of the current frame. */ -#define VP8_EFLAG_NO_UPD_ARF (1<<23) - +#define VP8_EFLAG_NO_UPD_ARF (1 << 23) /*!\brief Force golden frame update * * When this flag is set, the encoder copy the contents of the current frame * to the golden frame buffer. */ -#define VP8_EFLAG_FORCE_GF (1<<19) - +#define VP8_EFLAG_FORCE_GF (1 << 19) /*!\brief Force alternate reference frame update * * When this flag is set, the encoder copy the contents of the current frame * to the alternate reference frame buffer. */ -#define VP8_EFLAG_FORCE_ARF (1<<24) - +#define VP8_EFLAG_FORCE_ARF (1 << 24) /*!\brief Disable entropy update * * When this flag is set, the encoder will not update its internal entropy * model based on the entropy of this frame. */ -#define VP8_EFLAG_NO_UPD_ENTROPY (1<<20) - +#define VP8_EFLAG_NO_UPD_ENTROPY (1 << 20) /*!\brief VPx encoder control functions * @@ -127,13 +118,13 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP8, VP9 */ - VP8E_USE_REFERENCE = 7, + VP8E_USE_REFERENCE = 7, /*!\brief Codec control function to pass an ROI map to encoder. * * Supported in codecs: VP8, VP9 */ - VP8E_SET_ROI_MAP = 8, + VP8E_SET_ROI_MAP = 8, /*!\brief Codec control function to pass an Active map to encoder. * @@ -145,7 +136,7 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP8, VP9 */ - VP8E_SET_SCALEMODE = 11, + VP8E_SET_SCALEMODE = 11, /*!\brief Codec control function to set encoder internal speed settings. * @@ -158,7 +149,7 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP8, VP9 */ - VP8E_SET_CPUUSED = 13, + VP8E_SET_CPUUSED = 13, /*!\brief Codec control function to enable automatic set and use alf frames. * @@ -510,10 +501,10 @@ enum vp8e_enc_control_id { * This set of constants define 1-D vpx scaling modes */ typedef enum vpx_scaling_mode_1d { - VP8E_NORMAL = 0, - VP8E_FOURFIVE = 1, - VP8E_THREEFIVE = 2, - VP8E_ONETWO = 3 + VP8E_NORMAL = 0, + VP8E_FOURFIVE = 1, + VP8E_THREEFIVE = 2, + VP8E_ONETWO = 3 } VPX_SCALING_MODE; /*!\brief vpx region of interest map @@ -525,13 +516,13 @@ typedef enum vpx_scaling_mode_1d { typedef struct vpx_roi_map { /*! An id between 0 and 3 for each 16x16 region within a frame. */ unsigned char *roi_map; - unsigned int rows; /**< Number of rows. */ - unsigned int cols; /**< Number of columns. */ + unsigned int rows; /**< Number of rows. */ + unsigned int cols; /**< Number of columns. */ // TODO(paulwilkins): broken for VP9 which has 8 segments // q and loop filter deltas for each segment // (see MAX_MB_SEGMENTS) - int delta_q[4]; /**< Quantizer deltas. */ - int delta_lf[4]; /**< Loop filter deltas. */ + int delta_q[4]; /**< Quantizer deltas. */ + int delta_lf[4]; /**< Loop filter deltas. */ /*! Static breakout threshold for each segment. 
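/* Usage sketch: combining the VP8_EFLAG_* bits above into the flags argument
 * of vpx_codec_encode() (declared in vpx/vpx_encoder.h).  Illustrative only,
 * not part of this header; assumes `encoder` and `raw` are an initialized
 * encoder context and a source image. */
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

static vpx_codec_err_t encode_from_golden_only(vpx_codec_ctx_t *encoder,
                                               const vpx_image_t *raw,
                                               vpx_codec_pts_t pts) {
  /* Predict only from the golden frame and leave the other buffers intact. */
  const vpx_enc_frame_flags_t flags =
      VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST |
      VP8_EFLAG_NO_UPD_ARF;
  return vpx_codec_encode(encoder, raw, pts, 1 /* duration */, flags,
                          VPX_DL_GOOD_QUALITY);
}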
*/ unsigned int static_threshold[4]; } vpx_roi_map_t; @@ -542,11 +533,11 @@ typedef struct vpx_roi_map { * */ - typedef struct vpx_active_map { - unsigned char *active_map; /**< specify an on (1) or off (0) each 16x16 region within a frame */ - unsigned int rows; /**< number of rows */ - unsigned int cols; /**< number of cols */ + /*!\brief specify an on (1) or off (0) each 16x16 region within a frame */ + unsigned char *active_map; + unsigned int rows; /**< number of rows */ + unsigned int cols; /**< number of cols */ } vpx_active_map_t; /*!\brief vpx image scaling mode @@ -555,8 +546,8 @@ typedef struct vpx_active_map { * */ typedef struct vpx_scaling_mode { - VPX_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */ - VPX_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */ + VPX_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */ + VPX_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */ } vpx_scaling_mode_t; /*!\brief VP8 token partition mode @@ -567,9 +558,9 @@ typedef struct vpx_scaling_mode { */ typedef enum { - VP8_ONE_TOKENPARTITION = 0, - VP8_TWO_TOKENPARTITION = 1, - VP8_FOUR_TOKENPARTITION = 2, + VP8_ONE_TOKENPARTITION = 0, + VP8_TWO_TOKENPARTITION = 1, + VP8_FOUR_TOKENPARTITION = 2, VP8_EIGHT_TOKENPARTITION = 3 } vp8e_token_partitions; @@ -585,10 +576,7 @@ typedef enum { * Changes the encoder to tune for certain types of input material. * */ -typedef enum { - VPX_TUNE_PSNR, - VPX_TUNE_SSIM -} vpx_tune_metric; +typedef enum { VPX_TUNE_PSNR, VPX_TUNE_SSIM } vpx_tune_metric; /*!\cond */ /*!\brief VP8 encoder control function parameter type @@ -600,53 +588,53 @@ typedef enum { VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_USE_REFERENCE, int) #define VPX_CTRL_VP8E_USE_REFERENCE -VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int) +VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int) #define VPX_CTRL_VP8E_SET_FRAME_FLAGS -VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) +VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) #define VPX_CTRL_VP8E_SET_ROI_MAP -VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) +VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP8E_SET_ACTIVEMAP -VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) +VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) #define VPX_CTRL_VP8E_SET_SCALEMODE -VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int) +VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int) #define VPX_CTRL_VP8E_SET_CPUUSED -VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int) #define VPX_CTRL_VP8E_SET_ENABLEAUTOALTREF #if CONFIG_EXT_REFS -VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOBWDREF, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOBWDREF, unsigned int) #define VPX_CTRL_VP8E_SET_ENABLEAUTOBWDREF #endif // CONFIG_EXT_REFS -VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY, unsigned int) #define VPX_CTRL_VP8E_SET_NOISE_SENSITIVITY -VPX_CTRL_USE_TYPE(VP8E_SET_SHARPNESS, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_SHARPNESS, unsigned int) #define VPX_CTRL_VP8E_SET_SHARPNESS -VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD, unsigned int) #define VPX_CTRL_VP8E_SET_STATIC_THRESHOLD -VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */ +VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */ #define VPX_CTRL_VP8E_SET_TOKEN_PARTITIONS -VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int) 
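/* Usage sketch for vpx_active_map_t and VP8E_SET_ACTIVEMAP: illustrative only,
 * not part of this header.  rows/cols count 16x16 regions, so they are rounded
 * up from the frame size; `encoder` is assumed to be an initialized encoder
 * context for a w x h stream. */
#include <stdlib.h>
#include <string.h>
#include "vpx/vp8cx.h"

static vpx_codec_err_t mark_all_blocks_active(vpx_codec_ctx_t *encoder,
                                              unsigned int w, unsigned int h) {
  vpx_active_map_t map;
  vpx_codec_err_t res;
  map.cols = (w + 15) / 16;
  map.rows = (h + 15) / 16;
  map.active_map = malloc(map.rows * map.cols);
  if (!map.active_map) return VPX_CODEC_MEM_ERROR;
  memset(map.active_map, 1, map.rows * map.cols); /* 1 = encode this block */
  res = vpx_codec_control(encoder, VP8E_SET_ACTIVEMAP, &map);
  free(map.active_map);
  return res;
}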
+VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int) #define VPX_CTRL_VP8E_SET_ARNR_MAXFRAMES -VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int) #define VPX_CTRL_VP8E_SET_ARNR_STRENGTH -VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_SET_ARNR_TYPE, unsigned int) +VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_SET_ARNR_TYPE, unsigned int) #define VPX_CTRL_VP8E_SET_ARNR_TYPE -VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vpx_tune_metric */ +VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vpx_tune_metric */ #define VPX_CTRL_VP8E_SET_TUNING -VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int) +VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int) #define VPX_CTRL_VP8E_SET_CQ_LEVEL -VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) #define VPX_CTRL_VP9E_SET_TILE_COLUMNS -VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) +VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) #define VPX_CTRL_VP9E_SET_TILE_ROWS -VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER -VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64 VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) @@ -672,7 +660,7 @@ VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int) #define VPX_CTRL_VP9E_SET_FRAME_PERIODIC_BOOST -VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int) #define VPX_CTRL_VP9E_SET_NOISE_SENSITIVITY VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vpx_tune_content */ @@ -681,10 +669,10 @@ VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vpx_tune_content */ VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int) #define VPX_CTRL_VP9E_SET_COLOR_SPACE -VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL -VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *) @@ -703,7 +691,7 @@ VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *) VPX_CTRL_USE_TYPE(VP10E_SET_SUPERBLOCK_SIZE, unsigned int) #define VPX_CTRL_VP10E_SET_SUPERBLOCK_SIZE -VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) #define VPX_CTRL_VP9E_SET_TARGET_LEVEL VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index a8c411af3ff6fff45e35d071b04d6415eac14964..2239b8698e1d3f38643acb49ab77ab957092b703 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\defgroup vp8_decoder WebM VP8/VP9 Decoder * \ingroup vp8 * @@ -33,7 +32,7 @@ extern "C" { * This interface provides the capability to decode VP10 streams. * @{ */ -extern vpx_codec_iface_t vpx_codec_vp10_dx_algo; +extern vpx_codec_iface_t vpx_codec_vp10_dx_algo; extern vpx_codec_iface_t *vpx_codec_vp10_dx(void); /*!@} - end algorithm interface member group*/ @@ -126,18 +125,17 @@ typedef void (*vpx_decrypt_cb)(void *decrypt_state, const unsigned char *input, * Defines a structure to hold the decryption state and access function. */ typedef struct vpx_decrypt_init { - /*! Decrypt callback. */ - vpx_decrypt_cb decrypt_cb; + /*! 
Decrypt callback. */ + vpx_decrypt_cb decrypt_cb; - /*! Decryption state. */ - void *decrypt_state; + /*! Decryption state. */ + void *decrypt_state; } vpx_decrypt_init; /*!\brief A deprecated alias for vpx_decrypt_init. */ typedef vpx_decrypt_init vp8_decrypt_init; - /*!\cond */ /*!\brief VP8 decoder control function parameter type * @@ -146,28 +144,27 @@ typedef vpx_decrypt_init vp8_decrypt_init; * */ - -VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *) +VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *) #define VPX_CTRL_VP8D_GET_LAST_REF_UPDATES -VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) +VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) #define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED -VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) +VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) #define VPX_CTRL_VP8D_GET_LAST_REF_USED -VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) +VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VPXD_SET_DECRYPTOR -VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) +VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VP8D_SET_DECRYPTOR -VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *) +VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *) #define VPX_CTRL_VP9D_GET_DISPLAY_SIZE -VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) +VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) #define VPX_CTRL_VP9D_GET_BIT_DEPTH -VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) +VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) #define VPX_CTRL_VP9D_GET_FRAME_SIZE VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER -VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_ROW, int) +VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_ROW, int) #define VPX_CTRL_VP10_SET_DECODE_TILE_ROW -VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_COL, int) +VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_COL, int) #define VPX_CTRL_VP10_SET_DECODE_TILE_COL /*!\endcond */ /*! @} - end defgroup vp8_decoder */ diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h index e65e3f41f53971e2d3bdb9eb7d039abb2100934b..107469fbe6505604c8a3838ac17069d310e904ce 100644 --- a/vpx/vpx_codec.h +++ b/vpx/vpx_codec.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\defgroup codec Common Algorithm Interface * This abstraction allows applications to easily support multiple video * formats with minimal code duplication. 
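/* Usage sketch for vpx_decrypt_init and VPXD_SET_DECRYPTOR: illustrative only,
 * not part of this header.  The callback below assumes the full vpx_decrypt_cb
 * signature from earlier in vp8dx.h (decrypt state, input, output, byte
 * count); the XOR "cipher" is a hypothetical stand-in for real key handling. */
#include "vpx/vp8dx.h"

static void my_decrypt_cb(void *decrypt_state, const unsigned char *input,
                          unsigned char *output, int count) {
  const unsigned char key = *(const unsigned char *)decrypt_state;
  int i;
  for (i = 0; i < count; ++i) output[i] = input[i] ^ key;
}

static vpx_codec_err_t install_decryptor(vpx_codec_ctx_t *decoder,
                                         unsigned char *key) {
  vpx_decrypt_init init;
  init.decrypt_cb = my_decrypt_cb; /* decrypt callback */
  init.decrypt_state = key;        /* opaque state handed back to it */
  return vpx_codec_control(decoder, VPXD_SET_DECRYPTOR, &init);
}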
This section describes the interface @@ -46,446 +45,431 @@ extern "C" { #include "./vpx_integer.h" #include "./vpx_image.h" - /*!\brief Decorator indicating a function is deprecated */ +/*!\brief Decorator indicating a function is deprecated */ #ifndef DEPRECATED #if defined(__GNUC__) && __GNUC__ -#define DEPRECATED __attribute__ ((deprecated)) +#define DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) #define DEPRECATED #else #define DEPRECATED #endif -#endif /* DEPRECATED */ +#endif /* DEPRECATED */ #ifndef DECLSPEC_DEPRECATED #if defined(__GNUC__) && __GNUC__ #define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ #elif defined(_MSC_VER) -#define DECLSPEC_DEPRECATED __declspec(deprecated) /**< \copydoc #DEPRECATED */ +/*!\brief \copydoc #DEPRECATED */ +#define DECLSPEC_DEPRECATED __declspec(deprecated) #else #define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ #endif -#endif /* DECLSPEC_DEPRECATED */ +#endif /* DECLSPEC_DEPRECATED */ - /*!\brief Decorator indicating a function is potentially unused */ +/*!\brief Decorator indicating a function is potentially unused */ #ifdef UNUSED #elif defined(__GNUC__) || defined(__clang__) -#define UNUSED __attribute__ ((unused)) +#define UNUSED __attribute__((unused)) #else #define UNUSED #endif - /*!\brief Current ABI version number - * - * \internal - * If this file is altered in any way that changes the ABI, this value - * must be bumped. Examples include, but are not limited to, changing - * types, removing or reassigning enums, adding/removing/rearranging - * fields to structures - */ +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ #define VPX_CODEC_ABI_VERSION (3 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/ - /*!\brief Algorithm return codes */ - typedef enum { - /*!\brief Operation completed without error */ - VPX_CODEC_OK, - - /*!\brief Unspecified error */ - VPX_CODEC_ERROR, - - /*!\brief Memory operation failed */ - VPX_CODEC_MEM_ERROR, - - /*!\brief ABI version mismatch */ - VPX_CODEC_ABI_MISMATCH, - - /*!\brief Algorithm does not have required capability */ - VPX_CODEC_INCAPABLE, - - /*!\brief The given bitstream is not supported. - * - * The bitstream was unable to be parsed at the highest level. The decoder - * is unable to proceed. This error \ref SHOULD be treated as fatal to the - * stream. */ - VPX_CODEC_UNSUP_BITSTREAM, - - /*!\brief Encoded bitstream uses an unsupported feature - * - * The decoder does not implement a feature required by the encoder. This - * return code should only be used for features that prevent future - * pictures from being properly decoded. This error \ref MAY be treated as - * fatal to the stream or \ref MAY be treated as fatal to the current GOP. - */ - VPX_CODEC_UNSUP_FEATURE, - - /*!\brief The coded data for this stream is corrupt or incomplete - * - * There was a problem decoding the current frame. This return code - * should only be used for failures that prevent future pictures from - * being properly decoded. This error \ref MAY be treated as fatal to the - * stream or \ref MAY be treated as fatal to the current GOP. If decoding - * is continued for the current GOP, artifacts may be present. - */ - VPX_CODEC_CORRUPT_FRAME, - - /*!\brief An application-supplied parameter is not valid. 
- * - */ - VPX_CODEC_INVALID_PARAM, - - /*!\brief An iterator reached the end of list. - * - */ - VPX_CODEC_LIST_END - - } - vpx_codec_err_t; - - - /*! \brief Codec capabilities bitfield - * - * Each codec advertises the capabilities it supports as part of its - * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces - * or functionality, and are not required to be supported. - * - * The available flags are specified by VPX_CODEC_CAP_* defines. - */ - typedef long vpx_codec_caps_t; -#define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ -#define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ +/*!\brief Algorithm return codes */ +typedef enum { + /*!\brief Operation completed without error */ + VPX_CODEC_OK, + /*!\brief Unspecified error */ + VPX_CODEC_ERROR, - /*! \brief Initialization-time Feature Enabling - * - * Certain codec features must be known at initialization time, to allow for - * proper memory allocation. - * - * The available flags are specified by VPX_CODEC_USE_* defines. - */ - typedef long vpx_codec_flags_t; - + /*!\brief Memory operation failed */ + VPX_CODEC_MEM_ERROR, - /*!\brief Codec interface structure. - * - * Contains function pointers and other data private to the codec - * implementation. This structure is opaque to the application. - */ - typedef const struct vpx_codec_iface vpx_codec_iface_t; + /*!\brief ABI version mismatch */ + VPX_CODEC_ABI_MISMATCH, + /*!\brief Algorithm does not have required capability */ + VPX_CODEC_INCAPABLE, - /*!\brief Codec private data structure. + /*!\brief The given bitstream is not supported. * - * Contains data private to the codec implementation. This structure is opaque - * to the application. - */ - typedef struct vpx_codec_priv vpx_codec_priv_t; - + * The bitstream was unable to be parsed at the highest level. The decoder + * is unable to proceed. This error \ref SHOULD be treated as fatal to the + * stream. */ + VPX_CODEC_UNSUP_BITSTREAM, - /*!\brief Iterator + /*!\brief Encoded bitstream uses an unsupported feature * - * Opaque storage used for iterating over lists. + * The decoder does not implement a feature required by the encoder. This + * return code should only be used for features that prevent future + * pictures from being properly decoded. This error \ref MAY be treated as + * fatal to the stream or \ref MAY be treated as fatal to the current GOP. */ - typedef const void *vpx_codec_iter_t; - + VPX_CODEC_UNSUP_FEATURE, - /*!\brief Codec context structure + /*!\brief The coded data for this stream is corrupt or incomplete * - * All codecs \ref MUST support this context structure fully. In general, - * this data should be considered private to the codec algorithm, and - * not be manipulated or examined by the calling application. Applications - * may reference the 'name' member to get a printable description of the - * algorithm. + * There was a problem decoding the current frame. This return code + * should only be used for failures that prevent future pictures from + * being properly decoded. This error \ref MAY be treated as fatal to the + * stream or \ref MAY be treated as fatal to the current GOP. If decoding + * is continued for the current GOP, artifacts may be present. 
*/ - typedef struct vpx_codec_ctx { - const char *name; /**< Printable interface name */ - vpx_codec_iface_t *iface; /**< Interface pointers */ - vpx_codec_err_t err; /**< Last returned error */ - const char *err_detail; /**< Detailed info, if available */ - vpx_codec_flags_t init_flags; /**< Flags passed at init time */ - union { - /**< Decoder Configuration Pointer */ - const struct vpx_codec_dec_cfg *dec; - /**< Encoder Configuration Pointer */ - const struct vpx_codec_enc_cfg *enc; - const void *raw; - } config; /**< Configuration pointer aliasing union */ - vpx_codec_priv_t *priv; /**< Algorithm private storage */ - } vpx_codec_ctx_t; - - /*!\brief Bit depth for codec - * * - * This enumeration determines the bit depth of the codec. - */ - typedef enum vpx_bit_depth { - VPX_BITS_8 = 8, /**< 8 bits */ - VPX_BITS_10 = 10, /**< 10 bits */ - VPX_BITS_12 = 12, /**< 12 bits */ - } vpx_bit_depth_t; + VPX_CODEC_CORRUPT_FRAME, - /*!\brief Superblock size selection. + /*!\brief An application-supplied parameter is not valid. * - * Defines the superblock size used for encoding. The superblock size can - * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically - * selected by the encoder for each frame. - */ - typedef enum vpx_superblock_size { - VPX_SUPERBLOCK_SIZE_64X64, /**< Always use 64x64 superblocks. */ - VPX_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */ - VPX_SUPERBLOCK_SIZE_DYNAMIC /**< Select superblock size dynamically. */ - } vpx_superblock_size_t; - - /* - * Library Version Number Interface - * - * For example, see the following sample return values: - * vpx_codec_version() (1<<16 | 2<<8 | 3) - * vpx_codec_version_str() "v1.2.3-rc1-16-gec6a1ba" - * vpx_codec_version_extra_str() "rc1-16-gec6a1ba" */ + VPX_CODEC_INVALID_PARAM, - /*!\brief Return the version information (as an integer) - * - * Returns a packed encoding of the library version number. This will only include - * the major.minor.patch component of the version number. Note that this encoded - * value should be accessed through the macros provided, as the encoding may change - * in the future. + /*!\brief An iterator reached the end of list. * */ - int vpx_codec_version(void); -#define VPX_VERSION_MAJOR(v) ((v>>16)&0xff) /**< extract major from packed version */ -#define VPX_VERSION_MINOR(v) ((v>>8)&0xff) /**< extract minor from packed version */ -#define VPX_VERSION_PATCH(v) ((v>>0)&0xff) /**< extract patch from packed version */ + VPX_CODEC_LIST_END - /*!\brief Return the version major number */ -#define vpx_codec_version_major() ((vpx_codec_version()>>16)&0xff) +} vpx_codec_err_t; - /*!\brief Return the version minor number */ -#define vpx_codec_version_minor() ((vpx_codec_version()>>8)&0xff) +/*! \brief Codec capabilities bitfield + * + * Each codec advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. + */ +typedef long vpx_codec_caps_t; +#define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ +#define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ - /*!\brief Return the version patch number */ -#define vpx_codec_version_patch() ((vpx_codec_version()>>0)&0xff) +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. 
+ * + * The available flags are specified by VPX_CODEC_USE_* defines. + */ +typedef long vpx_codec_flags_t; +/*!\brief Codec interface structure. + * + * Contains function pointers and other data private to the codec + * implementation. This structure is opaque to the application. + */ +typedef const struct vpx_codec_iface vpx_codec_iface_t; - /*!\brief Return the version information (as a string) - * - * Returns a printable string containing the full library version number. This may - * contain additional text following the three digit version number, as to indicate - * release candidates, prerelease versions, etc. - * - */ - const char *vpx_codec_version_str(void); +/*!\brief Codec private data structure. + * + * Contains data private to the codec implementation. This structure is opaque + * to the application. + */ +typedef struct vpx_codec_priv vpx_codec_priv_t; +/*!\brief Iterator + * + * Opaque storage used for iterating over lists. + */ +typedef const void *vpx_codec_iter_t; - /*!\brief Return the version information (as a string) - * - * Returns a printable "extra string". This is the component of the string returned - * by vpx_codec_version_str() following the three digit version number. - * - */ - const char *vpx_codec_version_extra_str(void); +/*!\brief Codec context structure + * + * All codecs \ref MUST support this context structure fully. In general, + * this data should be considered private to the codec algorithm, and + * not be manipulated or examined by the calling application. Applications + * may reference the 'name' member to get a printable description of the + * algorithm. + */ +typedef struct vpx_codec_ctx { + const char *name; /**< Printable interface name */ + vpx_codec_iface_t *iface; /**< Interface pointers */ + vpx_codec_err_t err; /**< Last returned error */ + const char *err_detail; /**< Detailed info, if available */ + vpx_codec_flags_t init_flags; /**< Flags passed at init time */ + union { + /**< Decoder Configuration Pointer */ + const struct vpx_codec_dec_cfg *dec; + /**< Encoder Configuration Pointer */ + const struct vpx_codec_enc_cfg *enc; + const void *raw; + } config; /**< Configuration pointer aliasing union */ + vpx_codec_priv_t *priv; /**< Algorithm private storage */ +} vpx_codec_ctx_t; + +/*!\brief Bit depth for codec + * * + * This enumeration determines the bit depth of the codec. + */ +typedef enum vpx_bit_depth { + VPX_BITS_8 = 8, /**< 8 bits */ + VPX_BITS_10 = 10, /**< 10 bits */ + VPX_BITS_12 = 12, /**< 12 bits */ +} vpx_bit_depth_t; +/*!\brief Superblock size selection. + * + * Defines the superblock size used for encoding. The superblock size can + * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically + * selected by the encoder for each frame. + */ +typedef enum vpx_superblock_size { + VPX_SUPERBLOCK_SIZE_64X64, /**< Always use 64x64 superblocks. */ + VPX_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */ + VPX_SUPERBLOCK_SIZE_DYNAMIC /**< Select superblock size dynamically. */ +} vpx_superblock_size_t; - /*!\brief Return the build configuration - * - * Returns a printable string containing an encoded version of the build - * configuration. This may be useful to vpx support. 
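/* Usage sketch: unpacking the packed version number with the macros above and
 * pairing it with the version/build strings.  Illustrative only, not part of
 * this header. */
#include <stdio.h>
#include "vpx/vpx_codec.h"

static void print_vpx_version(void) {
  const int v = vpx_codec_version();
  printf("libvpx %d.%d.%d (%s), built with %s\n", VPX_VERSION_MAJOR(v),
         VPX_VERSION_MINOR(v), VPX_VERSION_PATCH(v), vpx_codec_version_str(),
         vpx_codec_build_config());
}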
- * - */ - const char *vpx_codec_build_config(void); +/* + * Library Version Number Interface + * + * For example, see the following sample return values: + * vpx_codec_version() (1<<16 | 2<<8 | 3) + * vpx_codec_version_str() "v1.2.3-rc1-16-gec6a1ba" + * vpx_codec_version_extra_str() "rc1-16-gec6a1ba" + */ +/*!\brief Return the version information (as an integer) + * + * Returns a packed encoding of the library version number. This will only + * include + * the major.minor.patch component of the version number. Note that this encoded + * value should be accessed through the macros provided, as the encoding may + * change + * in the future. + * + */ +int vpx_codec_version(void); +#define VPX_VERSION_MAJOR(v) \ + ((v >> 16) & 0xff) /**< extract major from packed version */ +#define VPX_VERSION_MINOR(v) \ + ((v >> 8) & 0xff) /**< extract minor from packed version */ +#define VPX_VERSION_PATCH(v) \ + ((v >> 0) & 0xff) /**< extract patch from packed version */ - /*!\brief Return the name for a given interface - * - * Returns a human readable string for name of the given codec interface. - * - * \param[in] iface Interface pointer - * - */ - const char *vpx_codec_iface_name(vpx_codec_iface_t *iface); +/*!\brief Return the version major number */ +#define vpx_codec_version_major() ((vpx_codec_version() >> 16) & 0xff) +/*!\brief Return the version minor number */ +#define vpx_codec_version_minor() ((vpx_codec_version() >> 8) & 0xff) - /*!\brief Convert error number to printable string - * - * Returns a human readable string for the last error returned by the - * algorithm. The returned error will be one line and will not contain - * any newline characters. - * - * - * \param[in] err Error number. - * - */ - const char *vpx_codec_err_to_string(vpx_codec_err_t err); +/*!\brief Return the version patch number */ +#define vpx_codec_version_patch() ((vpx_codec_version() >> 0) & 0xff) +/*!\brief Return the version information (as a string) + * + * Returns a printable string containing the full library version number. This + * may + * contain additional text following the three digit version number, as to + * indicate + * release candidates, prerelease versions, etc. + * + */ +const char *vpx_codec_version_str(void); - /*!\brief Retrieve error synopsis for codec context - * - * Returns a human readable string for the last error returned by the - * algorithm. The returned error will be one line and will not contain - * any newline characters. - * - * - * \param[in] ctx Pointer to this instance's context. - * - */ - const char *vpx_codec_error(vpx_codec_ctx_t *ctx); +/*!\brief Return the version information (as a string) + * + * Returns a printable "extra string". This is the component of the string + * returned + * by vpx_codec_version_str() following the three digit version number. + * + */ +const char *vpx_codec_version_extra_str(void); +/*!\brief Return the build configuration + * + * Returns a printable string containing an encoded version of the build + * configuration. This may be useful to vpx support. + * + */ +const char *vpx_codec_build_config(void); - /*!\brief Retrieve detailed error information for codec context - * - * Returns a human readable string providing detailed information about - * the last error. - * - * \param[in] ctx Pointer to this instance's context. - * - * \retval NULL - * No detailed information is available. 
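/* Usage sketch: turning a failed call into a readable diagnostic with the
 * error helpers above.  Illustrative only, not part of this header; `ctx` is
 * assumed to be the context the failing call was made on. */
#include <stdio.h>
#include "vpx/vpx_codec.h"

static void report_error(vpx_codec_ctx_t *ctx, vpx_codec_err_t res,
                         const char *what) {
  const char *detail = vpx_codec_error_detail(ctx); /* may be NULL */
  fprintf(stderr, "%s failed: %s (%s)\n", what, vpx_codec_err_to_string(res),
          vpx_codec_error(ctx));
  if (detail) fprintf(stderr, "  detail: %s\n", detail);
}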
- */ - const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); +/*!\brief Return the name for a given interface + * + * Returns a human readable string for name of the given codec interface. + * + * \param[in] iface Interface pointer + * + */ +const char *vpx_codec_iface_name(vpx_codec_iface_t *iface); +/*!\brief Convert error number to printable string + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] err Error number. + * + */ +const char *vpx_codec_err_to_string(vpx_codec_err_t err); - /* REQUIRED FUNCTIONS - * - * The following functions are required to be implemented for all codecs. - * They represent the base case functionality expected of all codecs. - */ +/*!\brief Retrieve error synopsis for codec context + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] ctx Pointer to this instance's context. + * + */ +const char *vpx_codec_error(vpx_codec_ctx_t *ctx); - /*!\brief Destroy a codec instance - * - * Destroys a codec context, freeing any associated memory buffers. - * - * \param[in] ctx Pointer to this instance's context - * - * \retval #VPX_CODEC_OK - * The codec algorithm initialized. - * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. - */ - vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx); +/*!\brief Retrieve detailed error information for codec context + * + * Returns a human readable string providing detailed information about + * the last error. + * + * \param[in] ctx Pointer to this instance's context. + * + * \retval NULL + * No detailed information is available. + */ +const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all codecs. + * They represent the base case functionality expected of all codecs. + */ - /*!\brief Get the capabilities of an algorithm. - * - * Retrieves the capabilities bitfield from the algorithm's interface. - * - * \param[in] iface Pointer to the algorithm interface - * - */ - vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface); +/*!\brief Destroy a codec instance + * + * Destroys a codec context, freeing any associated memory buffers. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval #VPX_CODEC_OK + * The codec algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ +vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx); +/*!\brief Get the capabilities of an algorithm. + * + * Retrieves the capabilities bitfield from the algorithm's interface. + * + * \param[in] iface Pointer to the algorithm interface + * + */ +vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface); - /*!\brief Control algorithm - * - * This function is used to exchange algorithm specific data with the codec - * instance. This can be used to implement features specific to a particular - * algorithm. - * - * This wrapper function dispatches the request to the helper function - * associated with the given ctrl_id. It tries to call this function - * transparently, but will return #VPX_CODEC_ERROR if the request could not - * be dispatched. - * - * Note that this function should not be used directly. Call the - * #vpx_codec_control wrapper macro instead. 
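/* Usage sketch: calling a control through the vpx_codec_control() wrapper
 * macro rather than vpx_codec_control_() directly, so the argument type is
 * checked against the VPX_CTRL_USE_TYPE declaration at compile time.
 * Illustrative only, not part of this header; assumes `encoder` is an
 * initialized encoder context. */
#include "vpx/vp8cx.h"

static vpx_codec_err_t tune_speed(vpx_codec_ctx_t *encoder) {
  /* VP8E_SET_CPUUSED is declared to take an int, so the wrapper below rejects
   * mismatched argument types at compile time. */
  return vpx_codec_control(encoder, VP8E_SET_CPUUSED, 4);
}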
- * - * \param[in] ctx Pointer to this instance's context - * \param[in] ctrl_id Algorithm specific control identifier - * - * \retval #VPX_CODEC_OK - * The control request was processed. - * \retval #VPX_CODEC_ERROR - * The control request was not processed. - * \retval #VPX_CODEC_INVALID_PARAM - * The data was not valid. - */ - vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, - int ctrl_id, - ...); +/*!\brief Control algorithm + * + * This function is used to exchange algorithm specific data with the codec + * instance. This can be used to implement features specific to a particular + * algorithm. + * + * This wrapper function dispatches the request to the helper function + * associated with the given ctrl_id. It tries to call this function + * transparently, but will return #VPX_CODEC_ERROR if the request could not + * be dispatched. + * + * Note that this function should not be used directly. Call the + * #vpx_codec_control wrapper macro instead. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] ctrl_id Algorithm specific control identifier + * + * \retval #VPX_CODEC_OK + * The control request was processed. + * \retval #VPX_CODEC_ERROR + * The control request was not processed. + * \retval #VPX_CODEC_INVALID_PARAM + * The data was not valid. + */ +vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); #if defined(VPX_DISABLE_CTRL_TYPECHECKS) && VPX_DISABLE_CTRL_TYPECHECKS -# define vpx_codec_control(ctx,id,data) vpx_codec_control_(ctx,id,data) -# define VPX_CTRL_USE_TYPE(id, typ) -# define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) -# define VPX_CTRL_VOID(id, typ) +#define vpx_codec_control(ctx, id, data) vpx_codec_control_(ctx, id, data) +#define VPX_CTRL_USE_TYPE(id, typ) +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) +#define VPX_CTRL_VOID(id, typ) #else - /*!\brief vpx_codec_control wrapper macro - * - * This macro allows for type safe conversions across the variadic parameter - * to vpx_codec_control_(). - * - * \internal - * It works by dispatching the call to the control function through a wrapper - * function named with the id parameter. - */ -# define vpx_codec_control(ctx,id,data) vpx_codec_control_##id(ctx,id,data)\ - /**<\hideinitializer*/ - +/*!\brief vpx_codec_control wrapper macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). + * + * \internal + * It works by dispatching the call to the control function through a wrapper + * function named with the id parameter. + */ +#define vpx_codec_control(ctx, id, data) \ + vpx_codec_control_##id(ctx, id, data) /**<\hideinitializer*/ - /*!\brief vpx_codec_control type definition macro - * - * This macro allows for type safe conversions across the variadic parameter - * to vpx_codec_control_(). It defines the type of the argument for a given - * control identifier. - * - * \internal - * It defines a static function with - * the correctly typed arguments as a wrapper to the type-unsafe internal - * function. - */ -# define VPX_CTRL_USE_TYPE(id, typ) \ - static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t*, int, typ) UNUSED;\ - \ - static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t *ctx, int ctrl_id, typ data) {\ - return vpx_codec_control_(ctx, ctrl_id, data);\ +/*!\brief vpx_codec_control type definition macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). It defines the type of the argument for a given + * control identifier. 
+ * + * \internal + * It defines a static function with + * the correctly typed arguments as a wrapper to the type-unsafe internal + * function. + */ +#define VPX_CTRL_USE_TYPE(id, typ) \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int, typ) \ + UNUSED; \ + \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ + int ctrl_id, typ data) { \ + return vpx_codec_control_(ctx, ctrl_id, data); \ } /**<\hideinitializer*/ - - /*!\brief vpx_codec_control deprecated type definition macro - * - * Like #VPX_CTRL_USE_TYPE, but indicates that the specified control is - * deprecated and should not be used. Consult the documentation for your - * codec for more information. - * - * \internal - * It defines a static function with the correctly typed arguments as a - * wrapper to the type-unsafe internal function. - */ -# define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - DECLSPEC_DEPRECATED static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t*, int, typ) DEPRECATED UNUSED;\ - \ - DECLSPEC_DEPRECATED static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t *ctx, int ctrl_id, typ data) {\ - return vpx_codec_control_(ctx, ctrl_id, data);\ +/*!\brief vpx_codec_control deprecated type definition macro + * + * Like #VPX_CTRL_USE_TYPE, but indicates that the specified control is + * deprecated and should not be used. Consult the documentation for your + * codec for more information. + * + * \internal + * It defines a static function with the correctly typed arguments as a + * wrapper to the type-unsafe internal function. + */ +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ + DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *, int, typ) DEPRECATED UNUSED; \ + \ + DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ + return vpx_codec_control_(ctx, ctrl_id, data); \ } /**<\hideinitializer*/ - - /*!\brief vpx_codec_control void type definition macro - * - * This macro allows for type safe conversions across the variadic parameter - * to vpx_codec_control_(). It indicates that a given control identifier takes - * no argument. - * - * \internal - * It defines a static function without a data argument as a wrapper to the - * type-unsafe internal function. - */ -# define VPX_CTRL_VOID(id) \ - static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t*, int) UNUSED;\ - \ - static vpx_codec_err_t \ - vpx_codec_control_##id(vpx_codec_ctx_t *ctx, int ctrl_id) {\ - return vpx_codec_control_(ctx, ctrl_id);\ +/*!\brief vpx_codec_control void type definition macro + * + * This macro allows for type safe conversions across the variadic parameter + * to vpx_codec_control_(). It indicates that a given control identifier takes + * no argument. + * + * \internal + * It defines a static function without a data argument as a wrapper to the + * type-unsafe internal function. 
+ */ +#define VPX_CTRL_VOID(id) \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int) \ + UNUSED; \ + \ + static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ + int ctrl_id) { \ + return vpx_codec_control_(ctx, ctrl_id); \ } /**<\hideinitializer*/ - #endif - /*!@} - end defgroup codec*/ +/*!@} - end defgroup codec*/ #ifdef __cplusplus } #endif #endif // VPX_VPX_CODEC_H_ - diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index bfe90c6112e9c87fcf2bf0487cd8bb84a59f6a0b..3d8dd6ccb661396fb7269e1c26e13dd49bc19b71 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -32,347 +32,334 @@ extern "C" { #include "./vpx_codec.h" #include "./vpx_frame_buffer.h" - /*!\brief Current ABI version number - * - * \internal - * If this file is altered in any way that changes the ABI, this value - * must be bumped. Examples include, but are not limited to, changing - * types, removing or reassigning enums, adding/removing/rearranging - * fields to structures - */ -#define VPX_DECODER_ABI_VERSION (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ - - /*! \brief Decoder capabilities bitfield - * - * Each decoder advertises the capabilities it supports as part of its - * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces - * or functionality, and are not required to be supported by a decoder. - * - * The available flags are specified by VPX_CODEC_CAP_* defines. - */ -#define VPX_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */ -#define VPX_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */ -#define VPX_CODEC_CAP_POSTPROC 0x40000 /**< Can postprocess decoded frame */ -#define VPX_CODEC_CAP_ERROR_CONCEALMENT 0x80000 /**< Can conceal errors due to - packet loss */ -#define VPX_CODEC_CAP_INPUT_FRAGMENTS 0x100000 /**< Can receive encoded frames - one fragment at a time */ - - /*! \brief Initialization-time Feature Enabling - * - * Certain codec features must be known at initialization time, to allow for - * proper memory allocation. - * - * The available flags are specified by VPX_CODEC_USE_* defines. - */ -#define VPX_CODEC_CAP_FRAME_THREADING 0x200000 /**< Can support frame-based - multi-threading */ -#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 /**< Can support external - frame buffers */ - -#define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */ -#define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 /**< Conceal errors in decoded - frames */ -#define VPX_CODEC_USE_INPUT_FRAGMENTS 0x40000 /**< The input frame should be - passed to the decoder one - fragment at a time */ -#define VPX_CODEC_USE_FRAME_THREADING 0x80000 /**< Enable frame-based - multi-threading */ - - /*!\brief Stream properties - * - * This structure is used to query or set properties of the decoded - * stream. Algorithms may extend this structure with data specific - * to their bitstream by setting the sz member appropriately. - */ - typedef struct vpx_codec_stream_info { - unsigned int sz; /**< Size of this structure */ - unsigned int w; /**< Width (or 0 for unknown/default) */ - unsigned int h; /**< Height (or 0 for unknown/default) */ - unsigned int is_kf; /**< Current frame is a keyframe */ - } vpx_codec_stream_info_t; - - /* REQUIRED FUNCTIONS - * - * The following functions are required to be implemented for all decoders. - * They represent the base case functionality expected of all decoders. 
- */ - +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_DECODER_ABI_VERSION \ + (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ - /*!\brief Initialization Configurations - * - * This structure is used to pass init time configuration options to the - * decoder. - */ - typedef struct vpx_codec_dec_cfg { - unsigned int threads; /**< Maximum number of threads to use, default 1 */ - unsigned int w; /**< Width */ - unsigned int h; /**< Height */ - } vpx_codec_dec_cfg_t; /**< alias for struct vpx_codec_dec_cfg */ +/*! \brief Decoder capabilities bitfield + * + * Each decoder advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported by a decoder. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. + */ +#define VPX_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */ +#define VPX_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */ +#define VPX_CODEC_CAP_POSTPROC 0x40000 /**< Can postprocess decoded frame */ +/*!\brief Can conceal errors due to packet loss */ +#define VPX_CODEC_CAP_ERROR_CONCEALMENT 0x80000 +/*!\brief Can receive encoded frames one fragment at a time */ +#define VPX_CODEC_CAP_INPUT_FRAGMENTS 0x100000 + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by VPX_CODEC_USE_* defines. + */ +/*!\brief Can support frame-based multi-threading */ +#define VPX_CODEC_CAP_FRAME_THREADING 0x200000 +/*!brief Can support external frame buffers */ +#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 + +#define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */ +/*!\brief Conceal errors in decoded frames */ +#define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 +/*!\brief The input frame should be passed to the decoder one fragment at a + * time */ +#define VPX_CODEC_USE_INPUT_FRAGMENTS 0x40000 +/*!\brief Enable frame-based multi-threading */ +#define VPX_CODEC_USE_FRAME_THREADING 0x80000 + +/*!\brief Stream properties + * + * This structure is used to query or set properties of the decoded + * stream. Algorithms may extend this structure with data specific + * to their bitstream by setting the sz member appropriately. + */ +typedef struct vpx_codec_stream_info { + unsigned int sz; /**< Size of this structure */ + unsigned int w; /**< Width (or 0 for unknown/default) */ + unsigned int h; /**< Height (or 0 for unknown/default) */ + unsigned int is_kf; /**< Current frame is a keyframe */ +} vpx_codec_stream_info_t; + +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all decoders. + * They represent the base case functionality expected of all decoders. + */ +/*!\brief Initialization Configurations + * + * This structure is used to pass init time configuration options to the + * decoder. 
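/* Usage sketch: checking the interface's capability bits before requesting an
 * optional feature at initialization time.  Illustrative only, not part of
 * this header. */
#include "vpx/vpx_decoder.h"

static vpx_codec_flags_t choose_init_flags(vpx_codec_iface_t *iface) {
  vpx_codec_flags_t flags = 0;
  /* Only request postprocessing when the decoder advertises the capability. */
  if (vpx_codec_get_caps(iface) & VPX_CODEC_CAP_POSTPROC)
    flags |= VPX_CODEC_USE_POSTPROC;
  return flags;
}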
+ */ +typedef struct vpx_codec_dec_cfg { + unsigned int threads; /**< Maximum number of threads to use, default 1 */ + unsigned int w; /**< Width */ + unsigned int h; /**< Height */ +} vpx_codec_dec_cfg_t; /**< alias for struct vpx_codec_dec_cfg */ - /*!\brief Initialize a decoder instance - * - * Initializes a decoder context using the given interface. Applications - * should call the vpx_codec_dec_init convenience macro instead of this - * function directly, to ensure that the ABI version number parameter - * is properly initialized. - * - * If the library was configured with --disable-multithread, this call - * is not thread safe and should be guarded with a lock if being used - * in a multithreaded context. - * - * \param[in] ctx Pointer to this instance's context. - * \param[in] iface Pointer to the algorithm interface to use. - * \param[in] cfg Configuration to use, if known. May be NULL. - * \param[in] flags Bitfield of VPX_CODEC_USE_* flags - * \param[in] ver ABI version number. Must be set to - * VPX_DECODER_ABI_VERSION - * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. - * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. - */ - vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, - const vpx_codec_dec_cfg_t *cfg, - vpx_codec_flags_t flags, - int ver); +/*!\brief Initialize a decoder instance + * + * Initializes a decoder context using the given interface. Applications + * should call the vpx_codec_dec_init convenience macro instead of this + * function directly, to ensure that the ABI version number parameter + * is properly initialized. + * + * If the library was configured with --disable-multithread, this call + * is not thread safe and should be guarded with a lock if being used + * in a multithreaded context. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] ver ABI version number. Must be set to + * VPX_DECODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The decoder algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ +vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + const vpx_codec_dec_cfg_t *cfg, + vpx_codec_flags_t flags, int ver); - /*!\brief Convenience macro for vpx_codec_dec_init_ver() - * - * Ensures the ABI version parameter is properly set. - */ +/*!\brief Convenience macro for vpx_codec_dec_init_ver() + * + * Ensures the ABI version parameter is properly set. + */ #define vpx_codec_dec_init(ctx, iface, cfg, flags) \ vpx_codec_dec_init_ver(ctx, iface, cfg, flags, VPX_DECODER_ABI_VERSION) +/*!\brief Parse stream info from a buffer + * + * Performs high level parsing of the bitstream. Construction of a decoder + * context is not necessary. Can be used to determine if the bitstream is + * of the proper format, and to extract information from the stream. + * + * \param[in] iface Pointer to the algorithm interface + * \param[in] data Pointer to a block of data to parse + * \param[in] data_sz Size of the data buffer + * \param[in,out] si Pointer to stream info to update. The size member + * \ref MUST be properly initialized, but \ref MAY be + * clobbered by the algorithm. This parameter \ref MAY + * be NULL. 
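/* Usage sketch: initializing a decoder with an explicit configuration via the
 * vpx_codec_dec_init() convenience macro.  Illustrative only, not part of this
 * header; the interface comes from vpx_codec_vp10_dx() declared in vp8dx.h. */
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

static vpx_codec_err_t open_decoder(vpx_codec_ctx_t *decoder,
                                    unsigned int threads) {
  vpx_codec_dec_cfg_t cfg;
  cfg.threads = threads; /* maximum number of threads to use */
  cfg.w = 0;             /* unknown until the stream is parsed */
  cfg.h = 0;
  return vpx_codec_dec_init(decoder, vpx_codec_vp10_dx(), &cfg, 0);
}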
+ * + * \retval #VPX_CODEC_OK + * Bitstream is parsable and stream information updated + */ +vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, + const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si); - /*!\brief Parse stream info from a buffer - * - * Performs high level parsing of the bitstream. Construction of a decoder - * context is not necessary. Can be used to determine if the bitstream is - * of the proper format, and to extract information from the stream. - * - * \param[in] iface Pointer to the algorithm interface - * \param[in] data Pointer to a block of data to parse - * \param[in] data_sz Size of the data buffer - * \param[in,out] si Pointer to stream info to update. The size member - * \ref MUST be properly initialized, but \ref MAY be - * clobbered by the algorithm. This parameter \ref MAY - * be NULL. - * - * \retval #VPX_CODEC_OK - * Bitstream is parsable and stream information updated - */ - vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, - const uint8_t *data, - unsigned int data_sz, - vpx_codec_stream_info_t *si); - - - /*!\brief Return information about the current stream. - * - * Returns information about the stream that has been parsed during decoding. - * - * \param[in] ctx Pointer to this instance's context - * \param[in,out] si Pointer to stream info to update. The size member - * \ref MUST be properly initialized, but \ref MAY be - * clobbered by the algorithm. This parameter \ref MAY - * be NULL. - * - * \retval #VPX_CODEC_OK - * Bitstream is parsable and stream information updated - */ - vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, - vpx_codec_stream_info_t *si); - - - /*!\brief Decode data - * - * Processes a buffer of coded data. If the processing results in a new - * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be - * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode - * time stamp) order. Frames produced will always be in PTS (presentation - * time stamp) order. - * If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled, - * data and data_sz can contain a fragment of the encoded frame. Fragment - * \#n must contain at least partition \#n, but can also contain subsequent - * partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must - * be empty. When no more data is available, this function should be called - * with NULL as data and 0 as data_sz. The memory passed to this function - * must be available until the frame has been decoded. - * - * \param[in] ctx Pointer to this instance's context - * \param[in] data Pointer to this block of new coded data. If - * NULL, a VPX_CODEC_CB_PUT_FRAME event is posted - * for the previously decoded frame. - * \param[in] data_sz Size of the coded data, in bytes. - * \param[in] user_priv Application specific data to associate with - * this frame. - * \param[in] deadline Soft deadline the decoder should attempt to meet, - * in us. Set to zero for unlimited. - * - * \return Returns #VPX_CODEC_OK if the coded data was processed completely - * and future pictures can be decoded without error. Otherwise, - * see the descriptions of the other error codes in ::vpx_codec_err_t - * for recoverability capabilities. - */ - vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, - const uint8_t *data, - unsigned int data_sz, - void *user_priv, - long deadline); - - - /*!\brief Decoded frames iterator - * - * Iterates over a list of the frames available for display. 
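/* Usage sketch: probing a buffer with vpx_codec_peek_stream_info() before any
 * decoder context exists.  Illustrative only, not part of this header; note
 * that the sz member must be initialized before the call. */
#include <string.h>
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

static int looks_decodable(const uint8_t *data, unsigned int data_sz,
                           unsigned int *w, unsigned int *h) {
  vpx_codec_stream_info_t si;
  memset(&si, 0, sizeof(si));
  si.sz = sizeof(si); /* size member MUST be initialized */
  if (vpx_codec_peek_stream_info(vpx_codec_vp10_dx(), data, data_sz, &si) !=
      VPX_CODEC_OK)
    return 0;
  *w = si.w; /* 0 if unknown */
  *h = si.h;
  return 1;
}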
The iterator - * storage should be initialized to NULL to start the iteration. Iteration is - * complete when this function returns NULL. - * - * The list of available frames becomes valid upon completion of the - * vpx_codec_decode call, and remains valid until the next call to vpx_codec_decode. - * - * \param[in] ctx Pointer to this instance's context - * \param[in,out] iter Iterator storage, initialized to NULL - * - * \return Returns a pointer to an image, if one is ready for display. Frames - * produced will always be in PTS (presentation time stamp) order. - */ - vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, - vpx_codec_iter_t *iter); - - - /*!\defgroup cap_put_frame Frame-Based Decoding Functions - * - * The following functions are required to be implemented for all decoders - * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling these functions - * for codecs that don't advertise this capability will result in an error - * code being returned, usually VPX_CODEC_ERROR - * @{ - */ - - /*!\brief put frame callback prototype - * - * This callback is invoked by the decoder to notify the application of - * the availability of decoded image data. - */ - typedef void (*vpx_codec_put_frame_cb_fn_t)(void *user_priv, - const vpx_image_t *img); +/*!\brief Return information about the current stream. + * + * Returns information about the stream that has been parsed during decoding. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] si Pointer to stream info to update. The size member + * \ref MUST be properly initialized, but \ref MAY be + * clobbered by the algorithm. This parameter \ref MAY + * be NULL. + * + * \retval #VPX_CODEC_OK + * Bitstream is parsable and stream information updated + */ +vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, + vpx_codec_stream_info_t *si); +/*!\brief Decode data + * + * Processes a buffer of coded data. If the processing results in a new + * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be + * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode + * time stamp) order. Frames produced will always be in PTS (presentation + * time stamp) order. + * If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled, + * data and data_sz can contain a fragment of the encoded frame. Fragment + * \#n must contain at least partition \#n, but can also contain subsequent + * partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must + * be empty. When no more data is available, this function should be called + * with NULL as data and 0 as data_sz. The memory passed to this function + * must be available until the frame has been decoded. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] data Pointer to this block of new coded data. If + * NULL, a VPX_CODEC_CB_PUT_FRAME event is posted + * for the previously decoded frame. + * \param[in] data_sz Size of the coded data, in bytes. + * \param[in] user_priv Application specific data to associate with + * this frame. + * \param[in] deadline Soft deadline the decoder should attempt to meet, + * in us. Set to zero for unlimited. + * + * \return Returns #VPX_CODEC_OK if the coded data was processed completely + * and future pictures can be decoded without error. Otherwise, + * see the descriptions of the other error codes in ::vpx_codec_err_t + * for recoverability capabilities. 
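/* Usage sketch: feeding one coded frame to the decoder and draining every
 * image it produces.  Illustrative only, not part of this header; assumes
 * `decoder` was initialized as above and show_image() is a hypothetical sink
 * supplied by the application. */
#include "vpx/vpx_decoder.h"

extern void show_image(const vpx_image_t *img); /* hypothetical */

static vpx_codec_err_t decode_one(vpx_codec_ctx_t *decoder,
                                  const uint8_t *data, unsigned int data_sz) {
  vpx_codec_iter_t iter = NULL; /* iterator storage, initialized to NULL */
  const vpx_image_t *img;
  const vpx_codec_err_t res =
      vpx_codec_decode(decoder, data, data_sz, NULL, 0 /* no deadline */);
  if (res != VPX_CODEC_OK) return res;
  /* Frames come back in PTS order; the list stays valid until the next call
   * to vpx_codec_decode. */
  while ((img = vpx_codec_get_frame(decoder, &iter)) != NULL) show_image(img);
  return VPX_CODEC_OK;
}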
+ */ +vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, + unsigned int data_sz, void *user_priv, + long deadline); - /*!\brief Register for notification of frame completion. - * - * Registers a given function to be called when a decoded frame is - * available. - * - * \param[in] ctx Pointer to this instance's context - * \param[in] cb Pointer to the callback function - * \param[in] user_priv User's private data - * - * \retval #VPX_CODEC_OK - * Callback successfully registered. - * \retval #VPX_CODEC_ERROR - * Decoder context not initialized, or algorithm not capable of - * posting slice completion. - */ - vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, - vpx_codec_put_frame_cb_fn_t cb, - void *user_priv); +/*!\brief Decoded frames iterator + * + * Iterates over a list of the frames available for display. The iterator + * storage should be initialized to NULL to start the iteration. Iteration is + * complete when this function returns NULL. + * + * The list of available frames becomes valid upon completion of the + * vpx_codec_decode call, and remains valid until the next call to + * vpx_codec_decode. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an image, if one is ready for display. Frames + * produced will always be in PTS (presentation time stamp) order. + */ +vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter); +/*!\defgroup cap_put_frame Frame-Based Decoding Functions + * + * The following functions are required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling these + * functions + * for codecs that don't advertise this capability will result in an error + * code being returned, usually VPX_CODEC_ERROR + * @{ + */ - /*!@} - end defgroup cap_put_frame */ +/*!\brief put frame callback prototype + * + * This callback is invoked by the decoder to notify the application of + * the availability of decoded image data. + */ +typedef void (*vpx_codec_put_frame_cb_fn_t)(void *user_priv, + const vpx_image_t *img); - /*!\defgroup cap_put_slice Slice-Based Decoding Functions - * - * The following functions are required to be implemented for all decoders - * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling these functions - * for codecs that don't advertise this capability will result in an error - * code being returned, usually VPX_CODEC_ERROR - * @{ - */ +/*!\brief Register for notification of frame completion. + * + * Registers a given function to be called when a decoded frame is + * available. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb Pointer to the callback function + * \param[in] user_priv User's private data + * + * \retval #VPX_CODEC_OK + * Callback successfully registered. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * posting slice completion. + */ +vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_frame_cb_fn_t cb, + void *user_priv); - /*!\brief put slice callback prototype - * - * This callback is invoked by the decoder to notify the application of - * the availability of partially decoded image data. 
The - */ - typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv, - const vpx_image_t *img, - const vpx_image_rect_t *valid, - const vpx_image_rect_t *update); +/*!@} - end defgroup cap_put_frame */ +/*!\defgroup cap_put_slice Slice-Based Decoding Functions + * + * The following functions are required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling these + * functions + * for codecs that don't advertise this capability will result in an error + * code being returned, usually VPX_CODEC_ERROR + * @{ + */ - /*!\brief Register for notification of slice completion. - * - * Registers a given function to be called when a decoded slice is - * available. - * - * \param[in] ctx Pointer to this instance's context - * \param[in] cb Pointer to the callback function - * \param[in] user_priv User's private data - * - * \retval #VPX_CODEC_OK - * Callback successfully registered. - * \retval #VPX_CODEC_ERROR - * Decoder context not initialized, or algorithm not capable of - * posting slice completion. - */ - vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, - vpx_codec_put_slice_cb_fn_t cb, - void *user_priv); +/*!\brief put slice callback prototype + * + * This callback is invoked by the decoder to notify the application of + * the availability of partially decoded image data. The + */ +typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv, + const vpx_image_t *img, + const vpx_image_rect_t *valid, + const vpx_image_rect_t *update); +/*!\brief Register for notification of slice completion. + * + * Registers a given function to be called when a decoded slice is + * available. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb Pointer to the callback function + * \param[in] user_priv User's private data + * + * \retval #VPX_CODEC_OK + * Callback successfully registered. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * posting slice completion. + */ +vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, + vpx_codec_put_slice_cb_fn_t cb, + void *user_priv); - /*!@} - end defgroup cap_put_slice*/ +/*!@} - end defgroup cap_put_slice*/ - /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions - * - * The following section is required to be implemented for all decoders - * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. - * Calling this function for codecs that don't advertise this capability - * will result in an error code being returned, usually VPX_CODEC_ERROR. - * - * \note - * Currently this only works with VP9. - * @{ - */ +/*!\defgroup cap_external_frame_buffer External Frame Buffer Functions + * + * The following section is required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. + * Calling this function for codecs that don't advertise this capability + * will result in an error code being returned, usually VPX_CODEC_ERROR. + * + * \note + * Currently this only works with VP9. + * @{ + */ - /*!\brief Pass in external frame buffers for the decoder to use. - * - * Registers functions to be called when libvpx needs a frame buffer - * to decode the current frame and a function to be called when libvpx does - * not internally reference the frame buffer. This set function must - * be called before the first call to decode or libvpx will assume the - * default behavior of allocating frame buffers internally. 
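A hedged sketch of the callback pair that vpx_codec_set_frame_buffer_functions() expects, assuming the vpx_codec_frame_buffer_t layout (data, size, priv) and the get/release callback signatures declared in vpx/vpx_frame_buffer.h, which this hunk does not show.

#include <stdlib.h>
#include "vpx/vpx_decoder.h"
#include "vpx/vpx_frame_buffer.h" /* assumed: vpx_codec_frame_buffer_t + cb typedefs */

/* Give libvpx a caller-owned buffer of at least min_size bytes. */
static int example_get_fb(void *priv, size_t min_size,
                          vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  fb->data = (uint8_t *)calloc(1, min_size);
  if (fb->data == NULL) return -1; /* a negative return reports failure */
  fb->size = min_size;
  fb->priv = fb->data; /* remember what to free in the release callback */
  return 0;
}

/* Called once libvpx no longer references the buffer. */
static int example_release_fb(void *priv, vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  free(fb->priv);
  fb->data = NULL;
  return 0;
}

/* Wire the pair up before the first vpx_codec_decode() call (VP9 only):
 *   vpx_codec_set_frame_buffer_functions(&codec, example_get_fb,
 *                                        example_release_fb, NULL);
 */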
- * - * \param[in] ctx Pointer to this instance's context - * \param[in] cb_get Pointer to the get callback function - * \param[in] cb_release Pointer to the release callback function - * \param[in] cb_priv Callback's private data - * - * \retval #VPX_CODEC_OK - * External frame buffers will be used by libvpx. - * \retval #VPX_CODEC_INVALID_PARAM - * One or more of the callbacks were NULL. - * \retval #VPX_CODEC_ERROR - * Decoder context not initialized, or algorithm not capable of - * using external frame buffers. - * - * \note - * When decoding VP9, the application may be required to pass in at least - * #VPX_MAXIMUM_WORK_BUFFERS external frame - * buffers. - */ - vpx_codec_err_t vpx_codec_set_frame_buffer_functions( - vpx_codec_ctx_t *ctx, - vpx_get_frame_buffer_cb_fn_t cb_get, - vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); +/*!\brief Pass in external frame buffers for the decoder to use. + * + * Registers functions to be called when libvpx needs a frame buffer + * to decode the current frame and a function to be called when libvpx does + * not internally reference the frame buffer. This set function must + * be called before the first call to decode or libvpx will assume the + * default behavior of allocating frame buffers internally. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data + * + * \retval #VPX_CODEC_OK + * External frame buffers will be used by libvpx. + * \retval #VPX_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * using external frame buffers. + * + * \note + * When decoding VP9, the application may be required to pass in at least + * #VPX_MAXIMUM_WORK_BUFFERS external frame + * buffers. + */ +vpx_codec_err_t vpx_codec_set_frame_buffer_functions( + vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); - /*!@} - end defgroup cap_external_frame_buffer */ +/*!@} - end defgroup cap_external_frame_buffer */ - /*!@} - end defgroup decoder*/ +/*!@} - end defgroup decoder*/ #ifdef __cplusplus } #endif #endif // VPX_VPX_DECODER_H_ - diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 9a0f48ef824597b23c6282cb9ad9224b959364cb..62c3ce001e61e521d1632bab00a47d36dd4b60d3 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -31,869 +31,806 @@ extern "C" { #include "./vpx_codec.h" - /*!\brief Current ABI version number - * - * \internal - * If this file is altered in any way that changes the ABI, this value - * must be bumped. Examples include, but are not limited to, changing - * types, removing or reassigning enums, adding/removing/rearranging - * fields to structures - */ -#define VPX_ENCODER_ABI_VERSION (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ - +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_ENCODER_ABI_VERSION \ + (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ - /*! \brief Encoder capabilities bitfield - * - * Each encoder advertises the capabilities it supports as part of its - * ::vpx_codec_iface_t interface structure. 
Capabilities are extra - * interfaces or functionality, and are not required to be supported - * by an encoder. - * - * The available flags are specified by VPX_CODEC_CAP_* defines. - */ -#define VPX_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */ +/*! \brief Encoder capabilities bitfield + * + * Each encoder advertises the capabilities it supports as part of its + * ::vpx_codec_iface_t interface structure. Capabilities are extra + * interfaces or functionality, and are not required to be supported + * by an encoder. + * + * The available flags are specified by VPX_CODEC_CAP_* defines. + */ +#define VPX_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */ - /*! Can output one partition at a time. Each partition is returned in its - * own VPX_CODEC_CX_FRAME_PKT, with the FRAME_IS_FRAGMENT flag set for - * every partition but the last. In this mode all frames are always - * returned partition by partition. - */ -#define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000 +/*! Can output one partition at a time. Each partition is returned in its + * own VPX_CODEC_CX_FRAME_PKT, with the FRAME_IS_FRAGMENT flag set for + * every partition but the last. In this mode all frames are always + * returned partition by partition. + */ +#define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000 /*! Can support input images at greater than 8 bitdepth. */ -#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000 +#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000 - /*! \brief Initialization-time Feature Enabling - * - * Certain codec features must be known at initialization time, to allow - * for proper memory allocation. - * - * The available flags are specified by VPX_CODEC_USE_* defines. - */ -#define VPX_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ -#define VPX_CODEC_USE_OUTPUT_PARTITION 0x20000 /**< Make the encoder output one - partition at a time. */ +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow + * for proper memory allocation. + * + * The available flags are specified by VPX_CODEC_USE_* defines. + */ +#define VPX_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ +/*!\brief Make the encoder output one partition at a time. */ +#define VPX_CODEC_USE_OUTPUT_PARTITION 0x20000 #define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */ +/*!\brief Generic fixed size buffer structure + * + * This structure is able to hold a reference to any fixed size buffer. + */ +typedef struct vpx_fixed_buf { + void *buf; /**< Pointer to the data */ + size_t sz; /**< Length of the buffer, in chars */ +} vpx_fixed_buf_t; /**< alias for struct vpx_fixed_buf */ - /*!\brief Generic fixed size buffer structure - * - * This structure is able to hold a reference to any fixed size buffer. - */ - typedef struct vpx_fixed_buf { - void *buf; /**< Pointer to the data */ - size_t sz; /**< Length of the buffer, in chars */ - } vpx_fixed_buf_t; /**< alias for struct vpx_fixed_buf */ - - - /*!\brief Time Stamp Type - * - * An integer, which when multiplied by the stream's time base, provides - * the absolute time of a sample. - */ - typedef int64_t vpx_codec_pts_t; - - - /*!\brief Compressed Frame Flags - * - * This type represents a bitfield containing information about a compressed - * frame that may be useful to an application. 
The most significant 16 bits - * can be used by an algorithm to provide additional detail, for example to - * support frame types that are codec specific (MPEG-1 D-frames for example) - */ - typedef uint32_t vpx_codec_frame_flags_t; -#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */ -#define VPX_FRAME_IS_DROPPABLE 0x2 /**< frame can be dropped without affecting - the stream (no future frame depends on - this one) */ -#define VPX_FRAME_IS_INVISIBLE 0x4 /**< frame should be decoded but will not - be shown */ -#define VPX_FRAME_IS_FRAGMENT 0x8 /**< this is a fragment of the encoded - frame */ - - /*!\brief Error Resilient flags - * - * These flags define which error resilient features to enable in the - * encoder. The flags are specified through the - * vpx_codec_enc_cfg::g_error_resilient variable. - */ - typedef uint32_t vpx_codec_er_flags_t; -#define VPX_ERROR_RESILIENT_DEFAULT 0x1 /**< Improve resiliency against - losses of whole frames */ -#define VPX_ERROR_RESILIENT_PARTITIONS 0x2 /**< The frame partitions are - independently decodable by the - bool decoder, meaning that - partitions can be decoded even - though earlier partitions have - been lost. Note that intra - prediction is still done over - the partition boundary. */ - - /*!\brief Encoder output packet variants - * - * This enumeration lists the different kinds of data packets that can be - * returned by calls to vpx_codec_get_cx_data(). Algorithms \ref MAY - * extend this list to provide additional functionality. - */ - enum vpx_codec_cx_pkt_kind { - VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ - VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ - VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ - VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ - VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ - }; - - - /*!\brief Encoder output packet - * - * This structure contains the different kinds of output data the encoder - * may produce while compressing a frame. - */ - typedef struct vpx_codec_cx_pkt { - enum vpx_codec_cx_pkt_kind kind; /**< packet variant */ - union { - struct { - void *buf; /**< compressed data buffer */ - size_t sz; /**< length of compressed data */ - vpx_codec_pts_t pts; /**< time stamp to show frame - (in timebase units) */ - unsigned long duration; /**< duration to show frame - (in timebase units) */ - vpx_codec_frame_flags_t flags; /**< flags for this frame */ - int partition_id; /**< the partition id - defines the decoding order - of the partitions. Only - applicable when "output partition" - mode is enabled. First partition - has id 0.*/ - - } frame; /**< data for compressed frame packet */ - vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ - vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ - struct vpx_psnr_pkt { - unsigned int samples[4]; /**< Number of samples, total/y/u/v */ - uint64_t sse[4]; /**< sum squared error, total/y/u/v */ - double psnr[4]; /**< PSNR, total/y/u/v */ - } psnr; /**< data for PSNR packet */ - vpx_fixed_buf_t raw; /**< data for arbitrary packets */ - - /* This packet size is fixed to allow codecs to extend this - * interface without having to manage storage for raw packets, - * i.e., if it's smaller than 128 bytes, you can store in the - * packet list directly. 
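The packet union above is what vpx_codec_get_cx_data(), declared further down in this header, hands back; a small consumer might look like this sketch, with the FILE handles standing in for a real muxer and a stats sink.

#include <stdio.h>
#include "vpx/vpx_encoder.h"

/* Drain every pending packet after a vpx_codec_encode() call. */
static void drain_packets(vpx_codec_ctx_t *codec, FILE *frame_out,
                          FILE *stats_out) {
  vpx_codec_iter_t iter = NULL;
  const vpx_codec_cx_pkt_t *pkt;

  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
    switch (pkt->kind) {
      case VPX_CODEC_CX_FRAME_PKT: /* compressed frame -> muxer */
        fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, frame_out);
        break;
      case VPX_CODEC_STATS_PKT: /* first-pass stats -> global buffer/file */
        fwrite(pkt->data.twopass_stats.buf, 1, pkt->data.twopass_stats.sz,
               stats_out);
        break;
      default: /* unknown packet kinds must be ignored silently */
        break;
    }
  }
}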
- */ - char pad[128 - sizeof(enum vpx_codec_cx_pkt_kind)]; /**< fixed sz */ - } data; /**< packet data */ - } vpx_codec_cx_pkt_t; /**< alias for struct vpx_codec_cx_pkt */ - - /*!\brief Rational Number - * - * This structure holds a fractional value. - */ - typedef struct vpx_rational { - int num; /**< fraction numerator */ - int den; /**< fraction denominator */ - } vpx_rational_t; /**< alias for struct vpx_rational */ - - - /*!\brief Multi-pass Encoding Pass */ - enum vpx_enc_pass { - VPX_RC_ONE_PASS, /**< Single pass mode */ - VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */ - VPX_RC_LAST_PASS /**< Final pass of multi-pass mode */ - }; - - - /*!\brief Rate control mode */ - enum vpx_rc_mode { - VPX_VBR, /**< Variable Bit Rate (VBR) mode */ - VPX_CBR, /**< Constant Bit Rate (CBR) mode */ - VPX_CQ, /**< Constrained Quality (CQ) mode */ - VPX_Q, /**< Constant Quality (Q) mode */ - }; - - - /*!\brief Keyframe placement mode. - * - * This enumeration determines whether keyframes are placed automatically by - * the encoder or whether this behavior is disabled. Older releases of this - * SDK were implemented such that VPX_KF_FIXED meant keyframes were disabled. - * This name is confusing for this behavior, so the new symbols to be used - * are VPX_KF_AUTO and VPX_KF_DISABLED. - */ - enum vpx_kf_mode { - VPX_KF_FIXED, /**< deprecated, implies VPX_KF_DISABLED */ - VPX_KF_AUTO, /**< Encoder determines optimal placement automatically */ - VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ - }; - - - /*!\brief Encoded Frame Flags - * - * This type indicates a bitfield to be passed to vpx_codec_encode(), defining - * per-frame boolean values. By convention, bits common to all codecs will be - * named VPX_EFLAG_*, and bits specific to an algorithm will be named - * /algo/_eflag_*. The lower order 16 bits are reserved for common use. - */ - typedef long vpx_enc_frame_flags_t; -#define VPX_EFLAG_FORCE_KF (1<<0) /**< Force this frame to be a keyframe */ - - - /*!\brief Encoder configuration structure - * - * This structure contains the encoder settings that have common representations - * across all codecs. This doesn't imply that all codecs support all features, - * however. - */ - typedef struct vpx_codec_enc_cfg { - /* - * generic settings (g) - */ - - /*!\brief Algorithm specific "usage" value - * - * Algorithms may define multiple values for usage, which may convey the - * intent of how the application intends to use the stream. If this value - * is non-zero, consult the documentation for the codec to determine its - * meaning. - */ - unsigned int g_usage; - - - /*!\brief Maximum number of threads to use - * - * For multi-threaded implementations, use no more than this number of - * threads. The codec may use fewer threads than allowed. The value - * 0 is equivalent to the value 1. - */ - unsigned int g_threads; - - - /*!\brief Bitstream profile to use - * - * Some codecs support a notion of multiple bitstream profiles. Typically - * this maps to a set of features that are turned on or off. Often the - * profile to use is determined by the features of the intended decoder. - * Consult the documentation for the codec to determine the valid values - * for this parameter, or set to zero for a sane default. - */ - unsigned int g_profile; /**< profile of bitstream to use */ - - - - /*!\brief Width of the frame - * - * This value identifies the presentation resolution of the frame, - * in pixels. Note that the frames passed as input to the encoder must - * have this resolution. 
Frames will be presented by the decoder in this - * resolution, independent of any spatial resampling the encoder may do. - */ - unsigned int g_w; - - - /*!\brief Height of the frame - * - * This value identifies the presentation resolution of the frame, - * in pixels. Note that the frames passed as input to the encoder must - * have this resolution. Frames will be presented by the decoder in this - * resolution, independent of any spatial resampling the encoder may do. - */ - unsigned int g_h; - - /*!\brief Bit-depth of the codec - * - * This value identifies the bit_depth of the codec, - * Only certain bit-depths are supported as identified in the - * vpx_bit_depth_t enum. - */ - vpx_bit_depth_t g_bit_depth; - - /*!\brief Bit-depth of the input frames - * - * This value identifies the bit_depth of the input frames in bits. - * Note that the frames passed as input to the encoder must have - * this bit-depth. - */ - unsigned int g_input_bit_depth; - - /*!\brief Stream timebase units - * - * Indicates the smallest interval of time, in seconds, used by the stream. - * For fixed frame rate material, or variable frame rate material where - * frames are timed at a multiple of a given clock (ex: video capture), - * the \ref RECOMMENDED method is to set the timebase to the reciprocal - * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the - * pts to correspond to the frame number, which can be handy. For - * re-encoding video from containers with absolute time timestamps, the - * \ref RECOMMENDED method is to set the timebase to that of the parent - * container or multimedia framework (ex: 1/1000 for ms, as in FLV). - */ - struct vpx_rational g_timebase; - - - /*!\brief Enable error resilient modes. - * - * The error resilient bitfield indicates to the encoder which features - * it should enable to take measures for streaming over lossy or noisy - * links. - */ - vpx_codec_er_flags_t g_error_resilient; - - - /*!\brief Multi-pass Encoding Mode - * - * This value should be set to the current phase for multi-pass encoding. - * For single pass, set to #VPX_RC_ONE_PASS. - */ - enum vpx_enc_pass g_pass; - - - /*!\brief Allow lagged encoding - * - * If set, this value allows the encoder to consume a number of input - * frames before producing output frames. This allows the encoder to - * base decisions for the current frame on future frames. This does - * increase the latency of the encoding pipeline, so it is not appropriate - * in all situations (ex: realtime encoding). - * - * Note that this is a maximum value -- the encoder may produce frames - * sooner than the given limit. Set this value to 0 to disable this - * feature. - */ - unsigned int g_lag_in_frames; - - - /* - * rate control settings (rc) - */ - - /*!\brief Temporal resampling configuration, if supported by the codec. - * - * Temporal resampling allows the codec to "drop" frames as a strategy to - * meet its target data rate. This can cause temporal discontinuities in - * the encoded video, which may appear as stuttering during playback. This - * trade-off is often acceptable, but for many applications is not. It can - * be disabled in these cases. - * - * Note that not all codecs support this feature. All vpx VPx codecs do. - * For other codecs, consult the documentation for that algorithm. - * - * This threshold is described as a percentage of the target data buffer. - * When the data buffer falls below this percentage of fullness, a - * dropped frame is indicated. 
Set the threshold to zero (0) to disable - * this feature. - */ - unsigned int rc_dropframe_thresh; - - - /*!\brief Enable/disable spatial resampling, if supported by the codec. - * - * Spatial resampling allows the codec to compress a lower resolution - * version of the frame, which is then upscaled by the encoder to the - * correct presentation resolution. This increases visual quality at - * low data rates, at the expense of CPU time on the encoder/decoder. - */ - unsigned int rc_resize_allowed; - - /*!\brief Internal coded frame width. - * - * If spatial resampling is enabled this specifies the width of the - * encoded frame. - */ - unsigned int rc_scaled_width; - - /*!\brief Internal coded frame height. - * - * If spatial resampling is enabled this specifies the height of the - * encoded frame. - */ - unsigned int rc_scaled_height; - - /*!\brief Spatial resampling up watermark. - * - * This threshold is described as a percentage of the target data buffer. - * When the data buffer rises above this percentage of fullness, the - * encoder will step up to a higher resolution version of the frame. - */ - unsigned int rc_resize_up_thresh; - - - /*!\brief Spatial resampling down watermark. - * - * This threshold is described as a percentage of the target data buffer. - * When the data buffer falls below this percentage of fullness, the - * encoder will step down to a lower resolution version of the frame. - */ - unsigned int rc_resize_down_thresh; - - - /*!\brief Rate control algorithm to use. - * - * Indicates whether the end usage of this stream is to be streamed over - * a bandwidth constrained link, indicating that Constant Bit Rate (CBR) - * mode should be used, or whether it will be played back on a high - * bandwidth link, as from a local disk, where higher variations in - * bitrate are acceptable. - */ - enum vpx_rc_mode rc_end_usage; - - - /*!\brief Two-pass stats buffer. - * - * A buffer containing all of the stats packets produced in the first - * pass, concatenated. - */ - vpx_fixed_buf_t rc_twopass_stats_in; - - /*!\brief first pass mb stats buffer. - * - * A buffer containing all of the first pass mb stats packets produced - * in the first pass, concatenated. - */ - vpx_fixed_buf_t rc_firstpass_mb_stats_in; - - /*!\brief Target data rate - * - * Target bandwidth to use for this stream, in kilobits per second. - */ - unsigned int rc_target_bitrate; - - - /* - * quantizer settings - */ - - - /*!\brief Minimum (Best Quality) Quantizer - * - * The quantizer is the most direct control over the quality of the - * encoded image. The range of valid values for the quantizer is codec - * specific. Consult the documentation for the codec to determine the - * values to use. To determine the range programmatically, call - * vpx_codec_enc_config_default() with a usage value of 0. - */ - unsigned int rc_min_quantizer; - - - /*!\brief Maximum (Worst Quality) Quantizer - * - * The quantizer is the most direct control over the quality of the - * encoded image. The range of valid values for the quantizer is codec - * specific. Consult the documentation for the codec to determine the - * values to use. To determine the range programmatically, call - * vpx_codec_enc_config_default() with a usage value of 0. - */ - unsigned int rc_max_quantizer; - - - /* - * bitrate tolerance - */ - - - /*!\brief Rate control adaptation undershoot control - * - * This value, expressed as a percentage of the target bitrate, - * controls the maximum allowed adaptation speed of the codec. 
- * This factor controls the maximum amount of bits that can - * be subtracted from the target bitrate in order to compensate - * for prior overshoot. - * - * Valid values in the range 0-1000. - */ - unsigned int rc_undershoot_pct; - - - /*!\brief Rate control adaptation overshoot control - * - * This value, expressed as a percentage of the target bitrate, - * controls the maximum allowed adaptation speed of the codec. - * This factor controls the maximum amount of bits that can - * be added to the target bitrate in order to compensate for - * prior undershoot. - * - * Valid values in the range 0-1000. - */ - unsigned int rc_overshoot_pct; - - - /* - * decoder buffer model parameters - */ - - - /*!\brief Decoder Buffer Size - * - * This value indicates the amount of data that may be buffered by the - * decoding application. Note that this value is expressed in units of - * time (milliseconds). For example, a value of 5000 indicates that the - * client will buffer (at least) 5000ms worth of encoded data. Use the - * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if - * necessary. - */ - unsigned int rc_buf_sz; - - - /*!\brief Decoder Buffer Initial Size - * - * This value indicates the amount of data that will be buffered by the - * decoding application prior to beginning playback. This value is - * expressed in units of time (milliseconds). Use the target bitrate - * (#rc_target_bitrate) to convert to bits/bytes, if necessary. - */ - unsigned int rc_buf_initial_sz; - - - /*!\brief Decoder Buffer Optimal Size - * - * This value indicates the amount of data that the encoder should try - * to maintain in the decoder's buffer. This value is expressed in units - * of time (milliseconds). Use the target bitrate (#rc_target_bitrate) - * to convert to bits/bytes, if necessary. - */ - unsigned int rc_buf_optimal_sz; - +/*!\brief Time Stamp Type + * + * An integer, which when multiplied by the stream's time base, provides + * the absolute time of a sample. + */ +typedef int64_t vpx_codec_pts_t; - /* - * 2 pass rate control parameters - */ +/*!\brief Compressed Frame Flags + * + * This type represents a bitfield containing information about a compressed + * frame that may be useful to an application. The most significant 16 bits + * can be used by an algorithm to provide additional detail, for example to + * support frame types that are codec specific (MPEG-1 D-frames for example) + */ +typedef uint32_t vpx_codec_frame_flags_t; +#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */ +/*!\brief frame can be dropped without affecting the stream (no future frame + * depends on this one) */ +#define VPX_FRAME_IS_DROPPABLE 0x2 +/*!\brief frame should be decoded but will not be shown */ +#define VPX_FRAME_IS_INVISIBLE 0x4 +/*!\brief this is a fragment of the encoded frame */ +#define VPX_FRAME_IS_FRAGMENT 0x8 + +/*!\brief Error Resilient flags + * + * These flags define which error resilient features to enable in the + * encoder. The flags are specified through the + * vpx_codec_enc_cfg::g_error_resilient variable. + */ +typedef uint32_t vpx_codec_er_flags_t; +/*!\brief Improve resiliency against losses of whole frames */ +#define VPX_ERROR_RESILIENT_DEFAULT 0x1 +/*!\brief The frame partitions are independently decodable by the bool decoder, + * meaning that partitions can be decoded even though earlier partitions have + * been lost. Note that intra prediction is still done over the partition + * boundary. 
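A small sketch of combining the two flags above into vpx_codec_enc_cfg::g_error_resilient by bitwise OR, assuming a configuration obtained from vpx_codec_enc_config_default().

#include "vpx/vpx_encoder.h"

/* Enable both error-resilience features described above for streaming
 * over lossy links. */
static void enable_error_resilience(vpx_codec_enc_cfg_t *cfg) {
  cfg->g_error_resilient =
      VPX_ERROR_RESILIENT_DEFAULT | VPX_ERROR_RESILIENT_PARTITIONS;
}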
*/ +#define VPX_ERROR_RESILIENT_PARTITIONS 0x2 + +/*!\brief Encoder output packet variants + * + * This enumeration lists the different kinds of data packets that can be + * returned by calls to vpx_codec_get_cx_data(). Algorithms \ref MAY + * extend this list to provide additional functionality. + */ +enum vpx_codec_cx_pkt_kind { + VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ + VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ + VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ + VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ + VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ +}; + +/*!\brief Encoder output packet + * + * This structure contains the different kinds of output data the encoder + * may produce while compressing a frame. + */ +typedef struct vpx_codec_cx_pkt { + enum vpx_codec_cx_pkt_kind kind; /**< packet variant */ + union { + struct { + void *buf; /**< compressed data buffer */ + size_t sz; /**< length of compressed data */ + /*!\brief time stamp to show frame (in timebase units) */ + vpx_codec_pts_t pts; + /*!\brief duration to show frame (in timebase units) */ + unsigned long duration; + vpx_codec_frame_flags_t flags; /**< flags for this frame */ + /*!\brief the partition id defines the decoding order of the partitions. + * Only applicable when "output partition" mode is enabled. First + * partition has id 0.*/ + int partition_id; + } frame; /**< data for compressed frame packet */ + vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ + vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ + struct vpx_psnr_pkt { + unsigned int samples[4]; /**< Number of samples, total/y/u/v */ + uint64_t sse[4]; /**< sum squared error, total/y/u/v */ + double psnr[4]; /**< PSNR, total/y/u/v */ + } psnr; /**< data for PSNR packet */ + vpx_fixed_buf_t raw; /**< data for arbitrary packets */ + + /* This packet size is fixed to allow codecs to extend this + * interface without having to manage storage for raw packets, + * i.e., if it's smaller than 128 bytes, you can store in the + * packet list directly. + */ + char pad[128 - sizeof(enum vpx_codec_cx_pkt_kind)]; /**< fixed sz */ + } data; /**< packet data */ +} vpx_codec_cx_pkt_t; /**< alias for struct vpx_codec_cx_pkt */ + +/*!\brief Rational Number + * + * This structure holds a fractional value. + */ +typedef struct vpx_rational { + int num; /**< fraction numerator */ + int den; /**< fraction denominator */ +} vpx_rational_t; /**< alias for struct vpx_rational */ + +/*!\brief Multi-pass Encoding Pass */ +enum vpx_enc_pass { + VPX_RC_ONE_PASS, /**< Single pass mode */ + VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */ + VPX_RC_LAST_PASS /**< Final pass of multi-pass mode */ +}; + +/*!\brief Rate control mode */ +enum vpx_rc_mode { + VPX_VBR, /**< Variable Bit Rate (VBR) mode */ + VPX_CBR, /**< Constant Bit Rate (CBR) mode */ + VPX_CQ, /**< Constrained Quality (CQ) mode */ + VPX_Q, /**< Constant Quality (Q) mode */ +}; + +/*!\brief Keyframe placement mode. + * + * This enumeration determines whether keyframes are placed automatically by + * the encoder or whether this behavior is disabled. Older releases of this + * SDK were implemented such that VPX_KF_FIXED meant keyframes were disabled. + * This name is confusing for this behavior, so the new symbols to be used + * are VPX_KF_AUTO and VPX_KF_DISABLED. 
+ */ +enum vpx_kf_mode { + VPX_KF_FIXED, /**< deprecated, implies VPX_KF_DISABLED */ + VPX_KF_AUTO, /**< Encoder determines optimal placement automatically */ + VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ +}; +/*!\brief Encoded Frame Flags + * + * This type indicates a bitfield to be passed to vpx_codec_encode(), defining + * per-frame boolean values. By convention, bits common to all codecs will be + * named VPX_EFLAG_*, and bits specific to an algorithm will be named + * /algo/_eflag_*. The lower order 16 bits are reserved for common use. + */ +typedef long vpx_enc_frame_flags_t; +#define VPX_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */ - /*!\brief Two-pass mode CBR/VBR bias - * - * Bias, expressed on a scale of 0 to 100, for determining target size - * for the current frame. The value 0 indicates the optimal CBR mode - * value should be used. The value 100 indicates the optimal VBR mode - * value should be used. Values in between indicate which way the - * encoder should "lean." - */ - unsigned int rc_2pass_vbr_bias_pct; /**< RC mode bias between CBR and VBR(0-100: 0->CBR, 100->VBR) */ +/*!\brief Encoder configuration structure + * + * This structure contains the encoder settings that have common representations + * across all codecs. This doesn't imply that all codecs support all features, + * however. + */ +typedef struct vpx_codec_enc_cfg { + /* + * generic settings (g) + */ + /*!\brief Algorithm specific "usage" value + * + * Algorithms may define multiple values for usage, which may convey the + * intent of how the application intends to use the stream. If this value + * is non-zero, consult the documentation for the codec to determine its + * meaning. + */ + unsigned int g_usage; - /*!\brief Two-pass mode per-GOP minimum bitrate - * - * This value, expressed as a percentage of the target bitrate, indicates - * the minimum bitrate to be used for a single GOP (aka "section") - */ - unsigned int rc_2pass_vbr_minsection_pct; + /*!\brief Maximum number of threads to use + * + * For multi-threaded implementations, use no more than this number of + * threads. The codec may use fewer threads than allowed. The value + * 0 is equivalent to the value 1. + */ + unsigned int g_threads; + /*!\brief Bitstream profile to use + * + * Some codecs support a notion of multiple bitstream profiles. Typically + * this maps to a set of features that are turned on or off. Often the + * profile to use is determined by the features of the intended decoder. + * Consult the documentation for the codec to determine the valid values + * for this parameter, or set to zero for a sane default. + */ + unsigned int g_profile; /**< profile of bitstream to use */ - /*!\brief Two-pass mode per-GOP maximum bitrate - * - * This value, expressed as a percentage of the target bitrate, indicates - * the maximum bitrate to be used for a single GOP (aka "section") - */ - unsigned int rc_2pass_vbr_maxsection_pct; + /*!\brief Width of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_w; + /*!\brief Height of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. 
Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_h; - /* - * keyframing settings (kf) - */ + /*!\brief Bit-depth of the codec + * + * This value identifies the bit_depth of the codec, + * Only certain bit-depths are supported as identified in the + * vpx_bit_depth_t enum. + */ + vpx_bit_depth_t g_bit_depth; - /*!\brief Keyframe placement mode - * - * This value indicates whether the encoder should place keyframes at a - * fixed interval, or determine the optimal placement automatically - * (as governed by the #kf_min_dist and #kf_max_dist parameters) - */ - enum vpx_kf_mode kf_mode; + /*!\brief Bit-depth of the input frames + * + * This value identifies the bit_depth of the input frames in bits. + * Note that the frames passed as input to the encoder must have + * this bit-depth. + */ + unsigned int g_input_bit_depth; + + /*!\brief Stream timebase units + * + * Indicates the smallest interval of time, in seconds, used by the stream. + * For fixed frame rate material, or variable frame rate material where + * frames are timed at a multiple of a given clock (ex: video capture), + * the \ref RECOMMENDED method is to set the timebase to the reciprocal + * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the + * pts to correspond to the frame number, which can be handy. For + * re-encoding video from containers with absolute time timestamps, the + * \ref RECOMMENDED method is to set the timebase to that of the parent + * container or multimedia framework (ex: 1/1000 for ms, as in FLV). + */ + struct vpx_rational g_timebase; + /*!\brief Enable error resilient modes. + * + * The error resilient bitfield indicates to the encoder which features + * it should enable to take measures for streaming over lossy or noisy + * links. + */ + vpx_codec_er_flags_t g_error_resilient; - /*!\brief Keyframe minimum interval - * - * This value, expressed as a number of frames, prevents the encoder from - * placing a keyframe nearer than kf_min_dist to the previous keyframe. At - * least kf_min_dist frames non-keyframes will be coded before the next - * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval. - */ - unsigned int kf_min_dist; + /*!\brief Multi-pass Encoding Mode + * + * This value should be set to the current phase for multi-pass encoding. + * For single pass, set to #VPX_RC_ONE_PASS. + */ + enum vpx_enc_pass g_pass; + /*!\brief Allow lagged encoding + * + * If set, this value allows the encoder to consume a number of input + * frames before producing output frames. This allows the encoder to + * base decisions for the current frame on future frames. This does + * increase the latency of the encoding pipeline, so it is not appropriate + * in all situations (ex: realtime encoding). + * + * Note that this is a maximum value -- the encoder may produce frames + * sooner than the given limit. Set this value to 0 to disable this + * feature. + */ + unsigned int g_lag_in_frames; - /*!\brief Keyframe maximum interval - * - * This value, expressed as a number of frames, forces the encoder to code - * a keyframe if one has not been coded in the last kf_max_dist frames. - * A value of 0 implies all frames will be keyframes. Set kf_min_dist - * equal to kf_max_dist for a fixed interval. 
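Following the note above, a fixed keyframe cadence comes from pinning kf_min_dist and kf_max_dist to the same value; a hypothetical helper:

#include "vpx/vpx_encoder.h"

/* Force a keyframe exactly every `interval` frames by setting the
 * minimum and maximum keyframe distances to the same value. */
static void set_fixed_keyframe_interval(vpx_codec_enc_cfg_t *cfg,
                                        unsigned int interval) {
  cfg->kf_mode = VPX_KF_AUTO;
  cfg->kf_min_dist = interval;
  cfg->kf_max_dist = interval;
}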
- */ - unsigned int kf_max_dist; - } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ + /* + * rate control settings (rc) + */ - /*!\brief Initialize an encoder instance + /*!\brief Temporal resampling configuration, if supported by the codec. * - * Initializes a encoder context using the given interface. Applications - * should call the vpx_codec_enc_init convenience macro instead of this - * function directly, to ensure that the ABI version number parameter - * is properly initialized. + * Temporal resampling allows the codec to "drop" frames as a strategy to + * meet its target data rate. This can cause temporal discontinuities in + * the encoded video, which may appear as stuttering during playback. This + * trade-off is often acceptable, but for many applications is not. It can + * be disabled in these cases. * - * If the library was configured with --disable-multithread, this call - * is not thread safe and should be guarded with a lock if being used - * in a multithreaded context. + * Note that not all codecs support this feature. All vpx VPx codecs do. + * For other codecs, consult the documentation for that algorithm. * - * \param[in] ctx Pointer to this instance's context. - * \param[in] iface Pointer to the algorithm interface to use. - * \param[in] cfg Configuration to use, if known. May be NULL. - * \param[in] flags Bitfield of VPX_CODEC_USE_* flags - * \param[in] ver ABI version number. Must be set to - * VPX_ENCODER_ABI_VERSION - * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. - * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. + * This threshold is described as a percentage of the target data buffer. + * When the data buffer falls below this percentage of fullness, a + * dropped frame is indicated. Set the threshold to zero (0) to disable + * this feature. */ - vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, - const vpx_codec_enc_cfg_t *cfg, - vpx_codec_flags_t flags, - int ver); + unsigned int rc_dropframe_thresh; - - /*!\brief Convenience macro for vpx_codec_enc_init_ver() + /*!\brief Enable/disable spatial resampling, if supported by the codec. * - * Ensures the ABI version parameter is properly set. + * Spatial resampling allows the codec to compress a lower resolution + * version of the frame, which is then upscaled by the encoder to the + * correct presentation resolution. This increases visual quality at + * low data rates, at the expense of CPU time on the encoder/decoder. */ -#define vpx_codec_enc_init(ctx, iface, cfg, flags) \ - vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION) - + unsigned int rc_resize_allowed; - /*!\brief Initialize multi-encoder instance + /*!\brief Internal coded frame width. * - * Initializes multi-encoder context using the given interface. - * Applications should call the vpx_codec_enc_init_multi convenience macro - * instead of this function directly, to ensure that the ABI version number - * parameter is properly initialized. - * - * \param[in] ctx Pointer to this instance's context. - * \param[in] iface Pointer to the algorithm interface to use. - * \param[in] cfg Configuration to use, if known. May be NULL. - * \param[in] num_enc Total number of encoders. - * \param[in] flags Bitfield of VPX_CODEC_USE_* flags - * \param[in] dsf Pointer to down-sampling factors. - * \param[in] ver ABI version number. Must be set to - * VPX_ENCODER_ABI_VERSION - * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. 
- * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. + * If spatial resampling is enabled this specifies the width of the + * encoded frame. */ - vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, - vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, - int num_enc, - vpx_codec_flags_t flags, - vpx_rational_t *dsf, - int ver); - + unsigned int rc_scaled_width; - /*!\brief Convenience macro for vpx_codec_enc_init_multi_ver() + /*!\brief Internal coded frame height. * - * Ensures the ABI version parameter is properly set. + * If spatial resampling is enabled this specifies the height of the + * encoded frame. */ -#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \ - vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \ - VPX_ENCODER_ABI_VERSION) + unsigned int rc_scaled_height; - - /*!\brief Get a default configuration - * - * Initializes a encoder configuration structure with default values. Supports - * the notion of "usages" so that an algorithm may offer different default - * settings depending on the user's intended goal. This function \ref SHOULD - * be called by all applications to initialize the configuration structure - * before specializing the configuration with application specific values. + /*!\brief Spatial resampling up watermark. * - * \param[in] iface Pointer to the algorithm interface to use. - * \param[out] cfg Configuration buffer to populate. - * \param[in] reserved Must set to 0 for VP8 and VP9. + * This threshold is described as a percentage of the target data buffer. + * When the data buffer rises above this percentage of fullness, the + * encoder will step up to a higher resolution version of the frame. + */ + unsigned int rc_resize_up_thresh; + + /*!\brief Spatial resampling down watermark. * - * \retval #VPX_CODEC_OK - * The configuration was populated. - * \retval #VPX_CODEC_INCAPABLE - * Interface is not an encoder interface. - * \retval #VPX_CODEC_INVALID_PARAM - * A parameter was NULL, or the usage value was not recognized. - */ - vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, - unsigned int reserved); - - - /*!\brief Set or change configuration - * - * Reconfigures an encoder instance according to the given configuration. - * - * \param[in] ctx Pointer to this instance's context - * \param[in] cfg Configuration buffer to use - * - * \retval #VPX_CODEC_OK - * The configuration was populated. - * \retval #VPX_CODEC_INCAPABLE - * Interface is not an encoder interface. - * \retval #VPX_CODEC_INVALID_PARAM - * A parameter was NULL, or the usage value was not recognized. - */ - vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, - const vpx_codec_enc_cfg_t *cfg); - - - /*!\brief Get global stream headers - * - * Retrieves a stream level global header packet, if supported by the codec. - * - * \param[in] ctx Pointer to this instance's context - * - * \retval NULL - * Encoder does not support global header - * \retval Non-NULL - * Pointer to buffer containing global header packet - */ - vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); - - -#define VPX_DL_REALTIME (1) /**< deadline parameter analogous to - * VPx REALTIME mode. */ -#define VPX_DL_GOOD_QUALITY (1000000) /**< deadline parameter analogous to - * VPx GOOD QUALITY mode. */ -#define VPX_DL_BEST_QUALITY (0) /**< deadline parameter analogous to - * VPx BEST QUALITY mode. */ - /*!\brief Encode a frame - * - * Encodes a video frame at the given "presentation time." 
The presentation - * time stamp (PTS) \ref MUST be strictly increasing. - * - * The encoder supports the notion of a soft real-time deadline. Given a - * non-zero value to the deadline parameter, the encoder will make a "best - * effort" guarantee to return before the given time slice expires. It is - * implicit that limiting the available time to encode will degrade the - * output quality. The encoder can be given an unlimited time to produce the - * best possible frame by specifying a deadline of '0'. This deadline - * supercedes the VPx notion of "best quality, good quality, realtime". - * Applications that wish to map these former settings to the new deadline - * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY, - * and #VPX_DL_BEST_QUALITY. + * This threshold is described as a percentage of the target data buffer. + * When the data buffer falls below this percentage of fullness, the + * encoder will step down to a lower resolution version of the frame. + */ + unsigned int rc_resize_down_thresh; + + /*!\brief Rate control algorithm to use. * - * When the last frame has been passed to the encoder, this function should - * continue to be called, with the img parameter set to NULL. This will - * signal the end-of-stream condition to the encoder and allow it to encode - * any held buffers. Encoding is complete when vpx_codec_encode() is called - * and vpx_codec_get_cx_data() returns no data. + * Indicates whether the end usage of this stream is to be streamed over + * a bandwidth constrained link, indicating that Constant Bit Rate (CBR) + * mode should be used, or whether it will be played back on a high + * bandwidth link, as from a local disk, where higher variations in + * bitrate are acceptable. + */ + enum vpx_rc_mode rc_end_usage; + + /*!\brief Two-pass stats buffer. * - * \param[in] ctx Pointer to this instance's context - * \param[in] img Image data to encode, NULL to flush. - * \param[in] pts Presentation time stamp, in timebase units. - * \param[in] duration Duration to show frame, in timebase units. - * \param[in] flags Flags to use for encoding this frame. - * \param[in] deadline Time to spend encoding, in microseconds. (0=infinite) + * A buffer containing all of the stats packets produced in the first + * pass, concatenated. + */ + vpx_fixed_buf_t rc_twopass_stats_in; + + /*!\brief first pass mb stats buffer. * - * \retval #VPX_CODEC_OK - * The configuration was populated. - * \retval #VPX_CODEC_INCAPABLE - * Interface is not an encoder interface. - * \retval #VPX_CODEC_INVALID_PARAM - * A parameter was NULL, the image format is unsupported, etc. + * A buffer containing all of the first pass mb stats packets produced + * in the first pass, concatenated. */ - vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned long duration, - vpx_enc_frame_flags_t flags, - unsigned long deadline); + vpx_fixed_buf_t rc_firstpass_mb_stats_in; - /*!\brief Set compressed data output buffer + /*!\brief Target data rate * - * Sets the buffer that the codec should output the compressed data - * into. This call effectively sets the buffer pointer returned in the - * next VPX_CODEC_CX_FRAME_PKT packet. Subsequent packets will be - * appended into this buffer. The buffer is preserved across frames, - * so applications must periodically call this function after flushing - * the accumulated compressed data to disk or to the network to reset - * the pointer to the buffer's head. 
+ * Target bandwidth to use for this stream, in kilobits per second. + */ + unsigned int rc_target_bitrate; + + /* + * quantizer settings + */ + + /*!\brief Minimum (Best Quality) Quantizer * - * `pad_before` bytes will be skipped before writing the compressed - * data, and `pad_after` bytes will be appended to the packet. The size - * of the packet will be the sum of the size of the actual compressed - * data, pad_before, and pad_after. The padding bytes will be preserved - * (not overwritten). + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. Consult the documentation for the codec to determine the + * values to use. To determine the range programmatically, call + * vpx_codec_enc_config_default() with a usage value of 0. + */ + unsigned int rc_min_quantizer; + + /*!\brief Maximum (Worst Quality) Quantizer * - * Note that calling this function does not guarantee that the returned - * compressed data will be placed into the specified buffer. In the - * event that the encoded data will not fit into the buffer provided, - * the returned packet \ref MAY point to an internal buffer, as it would - * if this call were never used. In this event, the output packet will - * NOT have any padding, and the application must free space and copy it - * to the proper place. This is of particular note in configurations - * that may output multiple packets for a single encoded frame (e.g., lagged - * encoding) or if the application does not reset the buffer periodically. + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. Consult the documentation for the codec to determine the + * values to use. To determine the range programmatically, call + * vpx_codec_enc_config_default() with a usage value of 0. + */ + unsigned int rc_max_quantizer; + + /* + * bitrate tolerance + */ + + /*!\brief Rate control adaptation undershoot control * - * Applications may restore the default behavior of the codec providing - * the compressed data buffer by calling this function with a NULL - * buffer. + * This value, expressed as a percentage of the target bitrate, + * controls the maximum allowed adaptation speed of the codec. + * This factor controls the maximum amount of bits that can + * be subtracted from the target bitrate in order to compensate + * for prior overshoot. * - * Applications \ref MUSTNOT call this function during iteration of - * vpx_codec_get_cx_data(). + * Valid values in the range 0-1000. + */ + unsigned int rc_undershoot_pct; + + /*!\brief Rate control adaptation overshoot control * - * \param[in] ctx Pointer to this instance's context - * \param[in] buf Buffer to store compressed data into - * \param[in] pad_before Bytes to skip before writing compressed data - * \param[in] pad_after Bytes to skip after writing compressed data + * This value, expressed as a percentage of the target bitrate, + * controls the maximum allowed adaptation speed of the codec. + * This factor controls the maximum amount of bits that can + * be added to the target bitrate in order to compensate for + * prior undershoot. * - * \retval #VPX_CODEC_OK - * The buffer was set successfully. - * \retval #VPX_CODEC_INVALID_PARAM - * A parameter was NULL, the image format is unsupported, etc. + * Valid values in the range 0-1000. 
*/ - vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, - const vpx_fixed_buf_t *buf, - unsigned int pad_before, - unsigned int pad_after); + unsigned int rc_overshoot_pct; + /* + * decoder buffer model parameters + */ - /*!\brief Encoded data iterator - * - * Iterates over a list of data packets to be passed from the encoder to the - * application. The different kinds of packets available are enumerated in - * #vpx_codec_cx_pkt_kind. + /*!\brief Decoder Buffer Size * - * #VPX_CODEC_CX_FRAME_PKT packets should be passed to the application's - * muxer. Multiple compressed frames may be in the list. - * #VPX_CODEC_STATS_PKT packets should be appended to a global buffer. + * This value indicates the amount of data that may be buffered by the + * decoding application. Note that this value is expressed in units of + * time (milliseconds). For example, a value of 5000 indicates that the + * client will buffer (at least) 5000ms worth of encoded data. Use the + * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if + * necessary. + */ + unsigned int rc_buf_sz; + + /*!\brief Decoder Buffer Initial Size * - * The application \ref MUST silently ignore any packet kinds that it does - * not recognize or support. + * This value indicates the amount of data that will be buffered by the + * decoding application prior to beginning playback. This value is + * expressed in units of time (milliseconds). Use the target bitrate + * (#rc_target_bitrate) to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_initial_sz; + + /*!\brief Decoder Buffer Optimal Size * - * The data buffers returned from this function are only guaranteed to be - * valid until the application makes another call to any vpx_codec_* function. + * This value indicates the amount of data that the encoder should try + * to maintain in the decoder's buffer. This value is expressed in units + * of time (milliseconds). Use the target bitrate (#rc_target_bitrate) + * to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_optimal_sz; + + /* + * 2 pass rate control parameters + */ + + /*!\brief Two-pass mode CBR/VBR bias * - * \param[in] ctx Pointer to this instance's context - * \param[in,out] iter Iterator storage, initialized to NULL + * Bias, expressed on a scale of 0 to 100, for determining target size + * for the current frame. The value 0 indicates the optimal CBR mode + * value should be used. The value 100 indicates the optimal VBR mode + * value should be used. Values in between indicate which way the + * encoder should "lean." + */ + unsigned int rc_2pass_vbr_bias_pct; + + /*!\brief Two-pass mode per-GOP minimum bitrate * - * \return Returns a pointer to an output data packet (compressed frame data, - * two-pass statistics, etc.) or NULL to signal end-of-list. + * This value, expressed as a percentage of the target bitrate, indicates + * the minimum bitrate to be used for a single GOP (aka "section") + */ + unsigned int rc_2pass_vbr_minsection_pct; + + /*!\brief Two-pass mode per-GOP maximum bitrate * + * This value, expressed as a percentage of the target bitrate, indicates + * the maximum bitrate to be used for a single GOP (aka "section") */ - const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, - vpx_codec_iter_t *iter); + unsigned int rc_2pass_vbr_maxsection_pct; + /* + * keyframing settings (kf) + */ - /*!\brief Get Preview Frame - * - * Returns an image that can be used as a preview. Shows the image as it would - * exist at the decompressor. 
The application \ref MUST NOT write into this - * image buffer. + /*!\brief Keyframe placement mode * - * \param[in] ctx Pointer to this instance's context + * This value indicates whether the encoder should place keyframes at a + * fixed interval, or determine the optimal placement automatically + * (as governed by the #kf_min_dist and #kf_max_dist parameters) + */ + enum vpx_kf_mode kf_mode; + + /*!\brief Keyframe minimum interval * - * \return Returns a pointer to a preview image, or NULL if no image is - * available. + * This value, expressed as a number of frames, prevents the encoder from + * placing a keyframe nearer than kf_min_dist to the previous keyframe. At + * least kf_min_dist frames non-keyframes will be coded before the next + * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval. + */ + unsigned int kf_min_dist; + + /*!\brief Keyframe maximum interval * + * This value, expressed as a number of frames, forces the encoder to code + * a keyframe if one has not been coded in the last kf_max_dist frames. + * A value of 0 implies all frames will be keyframes. Set kf_min_dist + * equal to kf_max_dist for a fixed interval. */ - const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx); + unsigned int kf_max_dist; +} vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ + +/*!\brief Initialize an encoder instance + * + * Initializes a encoder context using the given interface. Applications + * should call the vpx_codec_enc_init convenience macro instead of this + * function directly, to ensure that the ABI version number parameter + * is properly initialized. + * + * If the library was configured with --disable-multithread, this call + * is not thread safe and should be guarded with a lock if being used + * in a multithreaded context. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] ver ABI version number. Must be set to + * VPX_ENCODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The decoder algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. + */ +vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + const vpx_codec_enc_cfg_t *cfg, + vpx_codec_flags_t flags, int ver); + +/*!\brief Convenience macro for vpx_codec_enc_init_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define vpx_codec_enc_init(ctx, iface, cfg, flags) \ + vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION) + +/*!\brief Initialize multi-encoder instance + * + * Initializes multi-encoder context using the given interface. + * Applications should call the vpx_codec_enc_init_multi convenience macro + * instead of this function directly, to ensure that the ABI version number + * parameter is properly initialized. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] num_enc Total number of encoders. + * \param[in] flags Bitfield of VPX_CODEC_USE_* flags + * \param[in] dsf Pointer to down-sampling factors. + * \param[in] ver ABI version number. Must be set to + * VPX_ENCODER_ABI_VERSION + * \retval #VPX_CODEC_OK + * The decoder algorithm initialized. + * \retval #VPX_CODEC_MEM_ERROR + * Memory allocation failed. 
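The configure-then-initialize flow documented above for vpx_codec_enc_init_ver() / vpx_codec_enc_init() looks roughly like the sketch below. It is only an illustration: vpx_codec_vp8_cx() comes from vpx/vp8cx.h, the g_w/g_h/g_timebase fields and VPX_KF_AUTO are declared in parts of vpx_encoder.h not shown in this hunk, and the numeric values are arbitrary.

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch: populate defaults, specialize a few fields, then initialize. */
static int open_vp8_encoder(vpx_codec_ctx_t *codec) {
  vpx_codec_enc_cfg_t cfg;
  if (vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0) != VPX_CODEC_OK)
    return -1;
  cfg.g_w = 640;               /* illustrative frame size */
  cfg.g_h = 480;
  cfg.g_timebase.num = 1;      /* 30 fps timebase */
  cfg.g_timebase.den = 30;
  cfg.rc_target_bitrate = 400; /* kilobits per second, per the field above */
  cfg.kf_mode = VPX_KF_AUTO;
  cfg.kf_max_dist = 150;       /* keyframe at least every 150 frames */
  /* The convenience macro fills in VPX_ENCODER_ABI_VERSION. */
  return vpx_codec_enc_init(codec, vpx_codec_vp8_cx(), &cfg, 0) == VPX_CODEC_OK
             ? 0
             : -1;
}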
+ */ +vpx_codec_err_t vpx_codec_enc_init_multi_ver( + vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, + int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver); + +/*!\brief Convenience macro for vpx_codec_enc_init_multi_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \ + vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \ + VPX_ENCODER_ABI_VERSION) + +/*!\brief Get a default configuration + * + * Initializes a encoder configuration structure with default values. Supports + * the notion of "usages" so that an algorithm may offer different default + * settings depending on the user's intended goal. This function \ref SHOULD + * be called by all applications to initialize the configuration structure + * before specializing the configuration with application specific values. + * + * \param[in] iface Pointer to the algorithm interface to use. + * \param[out] cfg Configuration buffer to populate. + * \param[in] reserved Must set to 0 for VP8 and VP9. + * + * \retval #VPX_CODEC_OK + * The configuration was populated. + * \retval #VPX_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, or the usage value was not recognized. + */ +vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg, + unsigned int reserved); + +/*!\brief Set or change configuration + * + * Reconfigures an encoder instance according to the given configuration. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cfg Configuration buffer to use + * + * \retval #VPX_CODEC_OK + * The configuration was populated. + * \retval #VPX_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, or the usage value was not recognized. + */ +vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, + const vpx_codec_enc_cfg_t *cfg); + +/*!\brief Get global stream headers + * + * Retrieves a stream level global header packet, if supported by the codec. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval NULL + * Encoder does not support global header + * \retval Non-NULL + * Pointer to buffer containing global header packet + */ +vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); + +/*!\brief deadline parameter analogous to VPx REALTIME mode. */ +#define VPX_DL_REALTIME (1) +/*!\brief deadline parameter analogous to VPx GOOD QUALITY mode. */ +#define VPX_DL_GOOD_QUALITY (1000000) +/*!\brief deadline parameter analogous to VPx BEST QUALITY mode. */ +#define VPX_DL_BEST_QUALITY (0) +/*!\brief Encode a frame + * + * Encodes a video frame at the given "presentation time." The presentation + * time stamp (PTS) \ref MUST be strictly increasing. + * + * The encoder supports the notion of a soft real-time deadline. Given a + * non-zero value to the deadline parameter, the encoder will make a "best + * effort" guarantee to return before the given time slice expires. It is + * implicit that limiting the available time to encode will degrade the + * output quality. The encoder can be given an unlimited time to produce the + * best possible frame by specifying a deadline of '0'. This deadline + * supercedes the VPx notion of "best quality, good quality, realtime". 
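Once a stream is running, the vpx_codec_enc_config_set() call documented above can apply an updated configuration to the live encoder. A minimal sketch, assuming codec and cfg are the context and configuration from the initialization sketch earlier:

/* Sketch: halve the target bitrate of an already-initialized encoder. */
static vpx_codec_err_t halve_bitrate(vpx_codec_ctx_t *codec,
                                     vpx_codec_enc_cfg_t *cfg) {
  cfg->rc_target_bitrate /= 2; /* still expressed in kilobits per second */
  return vpx_codec_enc_config_set(codec, cfg);
}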
+ * Applications that wish to map these former settings to the new deadline + * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY, + * and #VPX_DL_BEST_QUALITY. + * + * When the last frame has been passed to the encoder, this function should + * continue to be called, with the img parameter set to NULL. This will + * signal the end-of-stream condition to the encoder and allow it to encode + * any held buffers. Encoding is complete when vpx_codec_encode() is called + * and vpx_codec_get_cx_data() returns no data. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] img Image data to encode, NULL to flush. + * \param[in] pts Presentation time stamp, in timebase units. + * \param[in] duration Duration to show frame, in timebase units. + * \param[in] flags Flags to use for encoding this frame. + * \param[in] deadline Time to spend encoding, in microseconds. (0=infinite) + * + * \retval #VPX_CODEC_OK + * The configuration was populated. + * \retval #VPX_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. + */ +vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, + vpx_codec_pts_t pts, unsigned long duration, + vpx_enc_frame_flags_t flags, + unsigned long deadline); + +/*!\brief Set compressed data output buffer + * + * Sets the buffer that the codec should output the compressed data + * into. This call effectively sets the buffer pointer returned in the + * next VPX_CODEC_CX_FRAME_PKT packet. Subsequent packets will be + * appended into this buffer. The buffer is preserved across frames, + * so applications must periodically call this function after flushing + * the accumulated compressed data to disk or to the network to reset + * the pointer to the buffer's head. + * + * `pad_before` bytes will be skipped before writing the compressed + * data, and `pad_after` bytes will be appended to the packet. The size + * of the packet will be the sum of the size of the actual compressed + * data, pad_before, and pad_after. The padding bytes will be preserved + * (not overwritten). + * + * Note that calling this function does not guarantee that the returned + * compressed data will be placed into the specified buffer. In the + * event that the encoded data will not fit into the buffer provided, + * the returned packet \ref MAY point to an internal buffer, as it would + * if this call were never used. In this event, the output packet will + * NOT have any padding, and the application must free space and copy it + * to the proper place. This is of particular note in configurations + * that may output multiple packets for a single encoded frame (e.g., lagged + * encoding) or if the application does not reset the buffer periodically. + * + * Applications may restore the default behavior of the codec providing + * the compressed data buffer by calling this function with a NULL + * buffer. + * + * Applications \ref MUSTNOT call this function during iteration of + * vpx_codec_get_cx_data(). + * + * \param[in] ctx Pointer to this instance's context + * \param[in] buf Buffer to store compressed data into + * \param[in] pad_before Bytes to skip before writing compressed data + * \param[in] pad_after Bytes to skip after writing compressed data + * + * \retval #VPX_CODEC_OK + * The buffer was set successfully. + * \retval #VPX_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. 
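The encode/flush pattern described above, combined with the packet iteration documented for vpx_codec_get_cx_data() further below, typically looks like this sketch. write_packet is a hypothetical application callback, error handling is omitted, and the one-timebase-unit duration is only an example.

/* Sketch: encode one frame (or flush with img == NULL) and drain packets. */
static void encode_frame(vpx_codec_ctx_t *codec, const vpx_image_t *img,
                         vpx_codec_pts_t pts,
                         void (*write_packet)(const vpx_codec_cx_pkt_t *)) {
  vpx_codec_iter_t iter = NULL;
  const vpx_codec_cx_pkt_t *pkt;

  vpx_codec_encode(codec, img, pts, 1 /* duration */, 0 /* flags */,
                   VPX_DL_GOOD_QUALITY);

  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT)
      write_packet(pkt); /* hand compressed frames to the muxer */
    /* Other packet kinds (e.g. VPX_CODEC_STATS_PKT) are ignored here. */
  }
}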
+ */ +vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, + const vpx_fixed_buf_t *buf, + unsigned int pad_before, + unsigned int pad_after); +/*!\brief Encoded data iterator + * + * Iterates over a list of data packets to be passed from the encoder to the + * application. The different kinds of packets available are enumerated in + * #vpx_codec_cx_pkt_kind. + * + * #VPX_CODEC_CX_FRAME_PKT packets should be passed to the application's + * muxer. Multiple compressed frames may be in the list. + * #VPX_CODEC_STATS_PKT packets should be appended to a global buffer. + * + * The application \ref MUST silently ignore any packet kinds that it does + * not recognize or support. + * + * The data buffers returned from this function are only guaranteed to be + * valid until the application makes another call to any vpx_codec_* function. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an output data packet (compressed frame data, + * two-pass statistics, etc.) or NULL to signal end-of-list. + * + */ +const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, + vpx_codec_iter_t *iter); + +/*!\brief Get Preview Frame + * + * Returns an image that can be used as a preview. Shows the image as it would + * exist at the decompressor. The application \ref MUST NOT write into this + * image buffer. + * + * \param[in] ctx Pointer to this instance's context + * + * \return Returns a pointer to a preview image, or NULL if no image is + * available. + * + */ +const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx); - /*!@} - end defgroup encoder*/ +/*!@} - end defgroup encoder*/ #ifdef __cplusplus } #endif #endif // VPX_VPX_ENCODER_H_ - diff --git a/vpx/vpx_frame_buffer.h b/vpx/vpx_frame_buffer.h index 109aec445c1ec8d8e8596312021031762cf071b5..8adbe25aad815a51ace428e6b38b1325ffdeba3d 100644 --- a/vpx/vpx_frame_buffer.h +++ b/vpx/vpx_frame_buffer.h @@ -37,9 +37,9 @@ extern "C" { * This structure holds allocated frame buffers used by the decoder. 
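The vpx_codec_frame_buffer_t structure and the get/release callback prototypes in the vpx_frame_buffer.h hunk beginning here are normally implemented by the application as a pair. The sketch below uses plain malloc/free; a real client would usually pool buffers, and registration happens through vpx_codec_set_frame_buffer_functions() in vpx/vpx_decoder.h, which is outside this patch.

#include <stdlib.h>
#include <string.h>
#include "vpx/vpx_frame_buffer.h"

/* Sketch: satisfy a request for at least min_size bytes. Returning a value
 * less than zero reports failure to the decoder. */
static int example_get_frame_buffer(void *priv, size_t min_size,
                                    vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  fb->data = (uint8_t *)malloc(min_size);
  if (!fb->data) return -1;
  memset(fb->data, 0, min_size);
  fb->size = min_size;
  fb->priv = fb->data; /* whatever the release callback needs to find it */
  return 0;
}

static int example_release_frame_buffer(void *priv,
                                        vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  free(fb->priv);
  fb->data = NULL;
  return 0;
}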
*/ typedef struct vpx_codec_frame_buffer { - uint8_t *data; /**< Pointer to the data buffer */ - size_t size; /**< Size of data in bytes */ - void *priv; /**< Frame's private data */ + uint8_t *data; /**< Pointer to the data buffer */ + size_t size; /**< Size of data in bytes */ + void *priv; /**< Frame's private data */ } vpx_codec_frame_buffer_t; /*!\brief get frame buffer callback prototype @@ -60,8 +60,8 @@ typedef struct vpx_codec_frame_buffer { * \param[in] new_size Size in bytes needed by the buffer * \param[in,out] fb Pointer to vpx_codec_frame_buffer_t */ -typedef int (*vpx_get_frame_buffer_cb_fn_t)( - void *priv, size_t min_size, vpx_codec_frame_buffer_t *fb); +typedef int (*vpx_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size, + vpx_codec_frame_buffer_t *fb); /*!\brief release frame buffer callback prototype * @@ -73,8 +73,8 @@ typedef int (*vpx_get_frame_buffer_cb_fn_t)( * \param[in] priv Callback's private data * \param[in] fb Pointer to vpx_codec_frame_buffer_t */ -typedef int (*vpx_release_frame_buffer_cb_fn_t)( - void *priv, vpx_codec_frame_buffer_t *fb); +typedef int (*vpx_release_frame_buffer_cb_fn_t)(void *priv, + vpx_codec_frame_buffer_t *fb); #ifdef __cplusplus } // extern "C" diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h index 7958c69806ed0fbefdcca9ce1b075a5eb734d815..d6d3166d2ffd08d900deddc69f9278bdf185b6d3 100644 --- a/vpx/vpx_image.h +++ b/vpx/vpx_image.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*!\file * \brief Describes the vpx image descriptor and associated operations * @@ -20,213 +19,203 @@ extern "C" { #endif - /*!\brief Current ABI version number - * - * \internal - * If this file is altered in any way that changes the ABI, this value - * must be bumped. Examples include, but are not limited to, changing - * types, removing or reassigning enums, adding/removing/rearranging - * fields to structures - */ +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ #define VPX_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/ - -#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ -#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ -#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel. */ -#define VPX_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. 
*/ - - /*!\brief List of supported image formats */ - typedef enum vpx_img_fmt { - VPX_IMG_FMT_NONE, - VPX_IMG_FMT_RGB24, /**< 24 bit per pixel packed RGB */ - VPX_IMG_FMT_RGB32, /**< 32 bit per pixel packed 0RGB */ - VPX_IMG_FMT_RGB565, /**< 16 bit per pixel, 565 */ - VPX_IMG_FMT_RGB555, /**< 16 bit per pixel, 555 */ - VPX_IMG_FMT_UYVY, /**< UYVY packed YUV */ - VPX_IMG_FMT_YUY2, /**< YUYV packed YUV */ - VPX_IMG_FMT_YVYU, /**< YVYU packed YUV */ - VPX_IMG_FMT_BGR24, /**< 24 bit per pixel packed BGR */ - VPX_IMG_FMT_RGB32_LE, /**< 32 bit packed BGR0 */ - VPX_IMG_FMT_ARGB, /**< 32 bit packed ARGB, alpha=255 */ - VPX_IMG_FMT_ARGB_LE, /**< 32 bit packed BGRA, alpha=255 */ - VPX_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */ - VPX_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */ - VPX_IMG_FMT_YV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */ - VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2, - VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 3, /** < planar 4:2:0 format with vpx color space */ - VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, - VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5, - VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6, - VPX_IMG_FMT_I440 = VPX_IMG_FMT_PLANAR | 7, - VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 6, - VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH, - VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH, - VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH, - VPX_IMG_FMT_I44016 = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH - } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */ - - /*!\brief List of supported color spaces */ - typedef enum vpx_color_space { - VPX_CS_UNKNOWN = 0, /**< Unknown */ - VPX_CS_BT_601 = 1, /**< BT.601 */ - VPX_CS_BT_709 = 2, /**< BT.709 */ - VPX_CS_SMPTE_170 = 3, /**< SMPTE.170 */ - VPX_CS_SMPTE_240 = 4, /**< SMPTE.240 */ - VPX_CS_BT_2020 = 5, /**< BT.2020 */ - VPX_CS_RESERVED = 6, /**< Reserved */ - VPX_CS_SRGB = 7 /**< sRGB */ - } vpx_color_space_t; /**< alias for enum vpx_color_space */ - - /*!\brief List of supported color range */ - typedef enum vpx_color_range { - VPX_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */ - VPX_CR_FULL_RANGE = 1 /**< YUV/RGB [0..255] */ - } vpx_color_range_t; /**< alias for enum vpx_color_range */ - - /**\brief Image Descriptor */ - typedef struct vpx_image { - vpx_img_fmt_t fmt; /**< Image Format */ - vpx_color_space_t cs; /**< Color Space */ - vpx_color_range_t range; /**< Color Range */ - - /* Image storage dimensions */ - unsigned int w; /**< Stored image width */ - unsigned int h; /**< Stored image height */ - unsigned int bit_depth; /**< Stored image bit-depth */ - - /* Image display dimensions */ - unsigned int d_w; /**< Displayed image width */ - unsigned int d_h; /**< Displayed image height */ - - /* Image intended rendering dimensions */ - unsigned int r_w; /**< Intended rendering image width */ - unsigned int r_h; /**< Intended rendering image height */ - - /* Chroma subsampling info */ - unsigned int x_chroma_shift; /**< subsampling order, X */ - unsigned int y_chroma_shift; /**< subsampling order, Y */ - - /* Image data pointers. 
*/ -#define VPX_PLANE_PACKED 0 /**< To be used for all packed formats */ -#define VPX_PLANE_Y 0 /**< Y (Luminance) plane */ -#define VPX_PLANE_U 1 /**< U (Chroma) plane */ -#define VPX_PLANE_V 2 /**< V (Chroma) plane */ -#define VPX_PLANE_ALPHA 3 /**< A (Transparency) plane */ - unsigned char *planes[4]; /**< pointer to the top left pixel for each plane */ - int stride[4]; /**< stride between rows for each plane */ - - int bps; /**< bits per sample (for packed formats) */ - - /* The following member may be set by the application to associate data - * with this image. - */ - void *user_priv; /**< may be set by the application to associate data - * with this image. */ - - /* The following members should be treated as private. */ - unsigned char *img_data; /**< private */ - int img_data_owner; /**< private */ - int self_allocd; /**< private */ - - void *fb_priv; /**< Frame buffer data associated with the image. */ - } vpx_image_t; /**< alias for struct vpx_image */ - - /**\brief Representation of a rectangle on a surface */ - typedef struct vpx_image_rect { - unsigned int x; /**< leftmost column */ - unsigned int y; /**< topmost row */ - unsigned int w; /**< width */ - unsigned int h; /**< height */ - } vpx_image_rect_t; /**< alias for struct vpx_image_rect */ - - /*!\brief Open a descriptor, allocating storage for the underlying image - * - * Returns a descriptor for storing an image of the given format. The - * storage for the descriptor is allocated on the heap. - * - * \param[in] img Pointer to storage for descriptor. If this parameter - * is NULL, the storage for the descriptor will be - * allocated on the heap. - * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image - * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image(stride). - * - * \return Returns a pointer to the initialized image descriptor. If the img - * parameter is non-null, the value of the img parameter will be - * returned. +#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ +#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ +#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel. */ +#define VPX_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. 
*/ + +/*!\brief List of supported image formats */ +typedef enum vpx_img_fmt { + VPX_IMG_FMT_NONE, + VPX_IMG_FMT_RGB24, /**< 24 bit per pixel packed RGB */ + VPX_IMG_FMT_RGB32, /**< 32 bit per pixel packed 0RGB */ + VPX_IMG_FMT_RGB565, /**< 16 bit per pixel, 565 */ + VPX_IMG_FMT_RGB555, /**< 16 bit per pixel, 555 */ + VPX_IMG_FMT_UYVY, /**< UYVY packed YUV */ + VPX_IMG_FMT_YUY2, /**< YUYV packed YUV */ + VPX_IMG_FMT_YVYU, /**< YVYU packed YUV */ + VPX_IMG_FMT_BGR24, /**< 24 bit per pixel packed BGR */ + VPX_IMG_FMT_RGB32_LE, /**< 32 bit packed BGR0 */ + VPX_IMG_FMT_ARGB, /**< 32 bit packed ARGB, alpha=255 */ + VPX_IMG_FMT_ARGB_LE, /**< 32 bit packed BGRA, alpha=255 */ + VPX_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */ + VPX_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */ + VPX_IMG_FMT_YV12 = + VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */ + VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2, + VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | + 3, /** < planar 4:2:0 format with vpx color space */ + VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, + VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5, + VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6, + VPX_IMG_FMT_I440 = VPX_IMG_FMT_PLANAR | 7, + VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 6, + VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I44016 = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH +} vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */ + +/*!\brief List of supported color spaces */ +typedef enum vpx_color_space { + VPX_CS_UNKNOWN = 0, /**< Unknown */ + VPX_CS_BT_601 = 1, /**< BT.601 */ + VPX_CS_BT_709 = 2, /**< BT.709 */ + VPX_CS_SMPTE_170 = 3, /**< SMPTE.170 */ + VPX_CS_SMPTE_240 = 4, /**< SMPTE.240 */ + VPX_CS_BT_2020 = 5, /**< BT.2020 */ + VPX_CS_RESERVED = 6, /**< Reserved */ + VPX_CS_SRGB = 7 /**< sRGB */ +} vpx_color_space_t; /**< alias for enum vpx_color_space */ + +/*!\brief List of supported color range */ +typedef enum vpx_color_range { + VPX_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */ + VPX_CR_FULL_RANGE = 1 /**< YUV/RGB [0..255] */ +} vpx_color_range_t; /**< alias for enum vpx_color_range */ + +/**\brief Image Descriptor */ +typedef struct vpx_image { + vpx_img_fmt_t fmt; /**< Image Format */ + vpx_color_space_t cs; /**< Color Space */ + vpx_color_range_t range; /**< Color Range */ + + /* Image storage dimensions */ + unsigned int w; /**< Stored image width */ + unsigned int h; /**< Stored image height */ + unsigned int bit_depth; /**< Stored image bit-depth */ + + /* Image display dimensions */ + unsigned int d_w; /**< Displayed image width */ + unsigned int d_h; /**< Displayed image height */ + + /* Image intended rendering dimensions */ + unsigned int r_w; /**< Intended rendering image width */ + unsigned int r_h; /**< Intended rendering image height */ + + /* Chroma subsampling info */ + unsigned int x_chroma_shift; /**< subsampling order, X */ + unsigned int y_chroma_shift; /**< subsampling order, Y */ + +/* Image data pointers. 
*/ +#define VPX_PLANE_PACKED 0 /**< To be used for all packed formats */ +#define VPX_PLANE_Y 0 /**< Y (Luminance) plane */ +#define VPX_PLANE_U 1 /**< U (Chroma) plane */ +#define VPX_PLANE_V 2 /**< V (Chroma) plane */ +#define VPX_PLANE_ALPHA 3 /**< A (Transparency) plane */ + unsigned char *planes[4]; /**< pointer to the top left pixel for each plane */ + int stride[4]; /**< stride between rows for each plane */ + + int bps; /**< bits per sample (for packed formats) */ + + /*!\brief The following member may be set by the application to associate + * data with this image. */ - vpx_image_t *vpx_img_alloc(vpx_image_t *img, - vpx_img_fmt_t fmt, - unsigned int d_w, - unsigned int d_h, - unsigned int align); - - /*!\brief Open a descriptor, using existing storage for the underlying image - * - * Returns a descriptor for storing an image of the given format. The - * storage for descriptor has been allocated elsewhere, and a descriptor is - * desired to "wrap" that storage. - * - * \param[in] img Pointer to storage for descriptor. If this parameter - * is NULL, the storage for the descriptor will be - * allocated on the heap. - * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image - * \param[in] align Alignment, in bytes, of each row in the image. - * \param[in] img_data Storage to use for the image - * - * \return Returns a pointer to the initialized image descriptor. If the img - * parameter is non-null, the value of the img parameter will be - * returned. - */ - vpx_image_t *vpx_img_wrap(vpx_image_t *img, - vpx_img_fmt_t fmt, - unsigned int d_w, - unsigned int d_h, - unsigned int align, - unsigned char *img_data); - - - /*!\brief Set the rectangle identifying the displayed portion of the image - * - * Updates the displayed rectangle (aka viewport) on the image surface to - * match the specified coordinates and size. - * - * \param[in] img Image descriptor - * \param[in] x leftmost column - * \param[in] y topmost row - * \param[in] w width - * \param[in] h height - * - * \return 0 if the requested rectangle is valid, nonzero otherwise. - */ - int vpx_img_set_rect(vpx_image_t *img, - unsigned int x, - unsigned int y, - unsigned int w, - unsigned int h); - - - /*!\brief Flip the image vertically (top for bottom) - * - * Adjusts the image descriptor's pointers and strides to make the image - * be referenced upside-down. - * - * \param[in] img Image descriptor - */ - void vpx_img_flip(vpx_image_t *img); + void *user_priv; - /*!\brief Close an image descriptor - * - * Frees all allocated storage associated with an image descriptor. - * - * \param[in] img Image descriptor - */ - void vpx_img_free(vpx_image_t *img); + /* The following members should be treated as private. */ + unsigned char *img_data; /**< private */ + int img_data_owner; /**< private */ + int self_allocd; /**< private */ + + void *fb_priv; /**< Frame buffer data associated with the image. */ +} vpx_image_t; /**< alias for struct vpx_image */ + +/**\brief Representation of a rectangle on a surface */ +typedef struct vpx_image_rect { + unsigned int x; /**< leftmost column */ + unsigned int y; /**< topmost row */ + unsigned int w; /**< width */ + unsigned int h; /**< height */ +} vpx_image_rect_t; /**< alias for struct vpx_image_rect */ + +/*!\brief Open a descriptor, allocating storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. The + * storage for the descriptor is allocated on the heap. 
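The planes[]/stride[] layout documented above, together with vpx_img_alloc() and vpx_img_free() declared later in this hunk, is typically used as in the sketch below. The 32-byte alignment and the fill values are illustrative choices, and make_blank_i420 is not a library function.

#include <string.h>
#include "vpx/vpx_image.h"

/* Sketch: allocate an I420 image and clear each plane row by row. Chroma
 * planes are subsampled according to x_chroma_shift / y_chroma_shift. */
static vpx_image_t *make_blank_i420(unsigned int w, unsigned int h) {
  vpx_image_t *img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, w, h, 32);
  int plane;
  if (!img) return NULL;
  for (plane = VPX_PLANE_Y; plane <= VPX_PLANE_V; ++plane) {
    unsigned char *row = img->planes[plane];
    const unsigned int plane_w = plane ? (w + 1) >> img->x_chroma_shift : w;
    const unsigned int plane_h = plane ? (h + 1) >> img->y_chroma_shift : h;
    unsigned int y;
    for (y = 0; y < plane_h; ++y) {
      memset(row, plane ? 128 : 0, plane_w); /* luma 0, chroma neutral */
      row += img->stride[plane];
    }
  }
  return img; /* release with vpx_img_free() when done */
}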
+ * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image(stride). + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align); + +/*!\brief Open a descriptor, using existing storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. The + * storage for descriptor has been allocated elsewhere, and a descriptor is + * desired to "wrap" that storage. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of each row in the image. + * \param[in] img_data Storage to use for the image + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int align, + unsigned char *img_data); + +/*!\brief Set the rectangle identifying the displayed portion of the image + * + * Updates the displayed rectangle (aka viewport) on the image surface to + * match the specified coordinates and size. + * + * \param[in] img Image descriptor + * \param[in] x leftmost column + * \param[in] y topmost row + * \param[in] w width + * \param[in] h height + * + * \return 0 if the requested rectangle is valid, nonzero otherwise. + */ +int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h); + +/*!\brief Flip the image vertically (top for bottom) + * + * Adjusts the image descriptor's pointers and strides to make the image + * be referenced upside-down. + * + * \param[in] img Image descriptor + */ +void vpx_img_flip(vpx_image_t *img); + +/*!\brief Close an image descriptor + * + * Frees all allocated storage associated with an image descriptor. + * + * \param[in] img Image descriptor + */ +void vpx_img_free(vpx_image_t *img); #ifdef __cplusplus } // extern "C" diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h index 2945c87ca45c73f64824880d3251284ad4b3c684..09bad9222d4356df00036475267705d83659f4d5 100644 --- a/vpx/vpx_integer.h +++ b/vpx/vpx_integer.h @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #ifndef VPX_VPX_INTEGER_H_ #define VPX_VPX_INTEGER_H_ @@ -25,13 +24,13 @@ #endif #if defined(VPX_EMULATE_INTTYPES) -typedef signed char int8_t; +typedef signed char int8_t; typedef signed short int16_t; -typedef signed int int32_t; +typedef signed int int32_t; -typedef unsigned char uint8_t; +typedef unsigned char uint8_t; typedef unsigned short uint16_t; -typedef unsigned int uint32_t; +typedef unsigned int uint32_t; #ifndef _UINTPTR_T_DEFINED typedef size_t uintptr_t; @@ -42,12 +41,12 @@ typedef size_t uintptr_t; /* Most platforms have the C99 standard integer types. 
*/ #if defined(__cplusplus) -# if !defined(__STDC_FORMAT_MACROS) -# define __STDC_FORMAT_MACROS -# endif -# if !defined(__STDC_LIMIT_MACROS) -# define __STDC_LIMIT_MACROS -# endif +#if !defined(__STDC_FORMAT_MACROS) +#define __STDC_FORMAT_MACROS +#endif +#if !defined(__STDC_LIMIT_MACROS) +#define __STDC_LIMIT_MACROS +#endif #endif // __cplusplus #include <stdint.h> diff --git a/vpx_dsp/add_noise.c b/vpx_dsp/add_noise.c index 4ae67a813ec8a33abe642a2bfa590aa81f9922fa..80b1af9dde8d7515c71fdc2c95a5859b6e56f05f 100644 --- a/vpx_dsp/add_noise.c +++ b/vpx_dsp/add_noise.c @@ -17,16 +17,14 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -void vpx_plane_add_noise_c(uint8_t *start, char *noise, - char blackclamp[16], - char whiteclamp[16], - char bothclamp[16], +void vpx_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16], + char whiteclamp[16], char bothclamp[16], unsigned int width, unsigned int height, int pitch) { unsigned int i, j; for (i = 0; i < height; ++i) { uint8_t *pos = start + i * pitch; - char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT + char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT for (j = 0; j < width; ++j) { int v = pos[j]; @@ -51,7 +49,7 @@ int vpx_setup_noise(double sigma, int size, char *noise) { // set up a 256 entry lookup that matches gaussian distribution for (i = -32; i < 32; ++i) { - const int a_i = (int) (0.5 + 256 * gaussian(sigma, 0, i)); + const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i)); if (a_i) { for (j = 0; j < a_i; ++j) { char_dist[next + j] = (char)i; diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index e52958c547fa16b07c0847ddff31bec9ad8d6e83..001517d33ee71a8095b515fedb28f453d8971368 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -198,27 +198,24 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { } } -void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int *min, int *max) { +void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min, int *max) { // Load and concatenate. 
- const uint8x16_t a01 = vcombine_u8(vld1_u8(a), - vld1_u8(a + a_stride)); - const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride), - vld1_u8(a + 3 * a_stride)); - const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride), - vld1_u8(a + 5 * a_stride)); - const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride), - vld1_u8(a + 7 * a_stride)); - - const uint8x16_t b01 = vcombine_u8(vld1_u8(b), - vld1_u8(b + b_stride)); - const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride), - vld1_u8(b + 3 * b_stride)); - const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride), - vld1_u8(b + 5 * b_stride)); - const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride), - vld1_u8(b + 7 * b_stride)); + const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride)); + const uint8x16_t a23 = + vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride)); + const uint8x16_t a45 = + vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride)); + const uint8x16_t a67 = + vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride)); + + const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride)); + const uint8x16_t b23 = + vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride)); + const uint8x16_t b45 = + vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride)); + const uint8x16_t b67 = + vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride)); // Absolute difference. const uint8x16_t ab01_diff = vabdq_u8(a01, b01); diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c index 9f9de98d90eb65093d5c0617b51eadbcce6d765b..7cb2ba90d2fe6cd63ecac35bc432fac58b9e35ba 100644 --- a/vpx_dsp/arm/fwd_txfm_neon.c +++ b/vpx_dsp/arm/fwd_txfm_neon.c @@ -131,14 +131,14 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { // 14 15 16 17 54 55 56 57 // 24 25 26 27 64 65 66 67 // 34 35 36 37 74 75 76 77 - const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0), - vreinterpretq_s32_s16(out_2)); - const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1), - vreinterpretq_s32_s16(out_3)); - const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4), - vreinterpretq_s32_s16(out_6)); - const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5), - vreinterpretq_s32_s16(out_7)); + const int32x4x2_t r02_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); const int16x8x2_t r01_s16 = vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), vreinterpretq_s16_s32(r13_s32.val[0])); diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c index 21e3e3dbacfb79ae044954081a4f3138d5514b98..46b2755ea68ed83266cfa591543587b43396dd64 100644 --- a/vpx_dsp/arm/hadamard_neon.c +++ b/vpx_dsp/arm/hadamard_neon.c @@ -12,9 +12,8 @@ #include "./vpx_dsp_rtcd.h" -static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, - int16x8_t *a2, int16x8_t *a3, - int16x8_t *a4, int16x8_t *a5, +static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, int16x8_t *a6, int16x8_t *a7) { const int16x8_t b0 = vaddq_s16(*a0, *a1); const int16x8_t b1 = vsubq_s16(*a0, *a1); @@ -47,9 +46,8 @@ static void 
hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, // TODO(johannkoenig): Make a transpose library and dedup with idct. Consider // reversing transpose order which may make it easier for the compiler to // reconcile the vtrn.64 moves. -static void transpose8x8(int16x8_t *a0, int16x8_t *a1, - int16x8_t *a2, int16x8_t *a3, - int16x8_t *a4, int16x8_t *a5, +static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, int16x8_t *a6, int16x8_t *a7) { // Swap 64 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 @@ -91,14 +89,14 @@ static void transpose8x8(int16x8_t *a0, int16x8_t *a1, // a1657_hi: // 12 13 28 29 44 45 60 61 // 14 15 30 31 46 47 62 63 - const int32x4x2_t a0246_lo = vtrnq_s32(vreinterpretq_s32_s16(a04_lo), - vreinterpretq_s32_s16(a26_lo)); - const int32x4x2_t a1357_lo = vtrnq_s32(vreinterpretq_s32_s16(a15_lo), - vreinterpretq_s32_s16(a37_lo)); - const int32x4x2_t a0246_hi = vtrnq_s32(vreinterpretq_s32_s16(a04_hi), - vreinterpretq_s32_s16(a26_hi)); - const int32x4x2_t a1357_hi = vtrnq_s32(vreinterpretq_s32_s16(a15_hi), - vreinterpretq_s32_s16(a37_hi)); + const int32x4x2_t a0246_lo = + vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo)); + const int32x4x2_t a1357_lo = + vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo)); + const int32x4x2_t a0246_hi = + vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi)); + const int32x4x2_t a1357_hi = + vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi)); // Swap 16 bit elements resulting in: // b0: diff --git a/vpx_dsp/arm/idct16x16_1_add_neon.c b/vpx_dsp/arm/idct16x16_1_add_neon.c index f734e48027944b9630e39768c3258269fa1fb53c..466b408893e2bfc9d367e65658e5f6de3b59c55d 100644 --- a/vpx_dsp/arm/idct16x16_1_add_neon.c +++ b/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -13,49 +13,46 @@ #include "vpx_dsp/inv_txfm.h" #include "vpx_ports/mem.h" -void vpx_idct16x16_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, j, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - for (d1 = d2 = dest, i = 0; i < 4; i++) { - for (j = 0; j < 2; j++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } +void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int 
dest_stride) { + uint8x8_t d2u8, d3u8, d30u8, d31u8; + uint64x1_t d2u64, d3u64, d4u64, d5u64; + uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, j, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); + + q0s16 = vdupq_n_s16(a1); + q0u16 = vreinterpretq_u16_s16(q0s16); + + for (d1 = d2 = dest, i = 0; i < 4; i++) { + for (j = 0; j < 2; j++) { + d2u64 = vld1_u64((const uint64_t *)d1); + d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + d4u64 = vld1_u64((const uint64_t *)d1); + d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + + q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); + q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); + q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); + q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); + d2 += dest_stride; } - return; + } + return; } diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c index 651ebb21f9967d4ac37380f403e7907ce8f3a2d9..6c03aff609b911e62c55b744ee8e1c9b1fdad24d 100644 --- a/vpx_dsp/arm/idct16x16_add_neon.c +++ b/vpx_dsp/arm/idct16x16_add_neon.c @@ -13,1175 +13,736 @@ #include "./vpx_config.h" #include "vpx_dsp/txfm_common.h" -static INLINE void TRANSPOSE8X8( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), - vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), - vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), - vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), - 
vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; +static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, + int16x8_t *q10s16, int16x8_t *q11s16, + int16x8_t *q12s16, int16x8_t *q13s16, + int16x8_t *q14s16, int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; } -void vpx_idct16x16_256_add_neon_pass1( - int16_t *in, - int16_t *out, - int output_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - 
uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d18s16, d1s16); - q6s32 = vmull_s16(d19s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlal_s16(q5s32, d30s16, d0s16); - q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q5s32, 14); - d15s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - q2s32 = vmull_s16(d26s16, d2s16); - q3s32 = vmull_s16(d27s16, d2s16); - q9s32 = vmull_s16(d26s16, d3s16); - q15s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q15s32 = vmlal_s16(q15s32, d23s16, d2s16); - - d10s16 = vqrshrn_n_s32(q2s32, 14); - d11s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q15s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - d30s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d30s16); - q11s32 = vmull_s16(d17s16, d30s16); - q0s32 = vmull_s16(d24s16, d30s16); - q1s32 = vmull_s16(d25s16, d30s16); - - d30s16 = vdup_n_s16(cospi_24_64); - d31s16 = vdup_n_s16(cospi_8_64); - - q3s32 = vaddq_s32(q2s32, q0s32); - q12s32 = vaddq_s32(q11s32, q1s32); - q13s32 = vsubq_s32(q2s32, q0s32); - q1s32 = vsubq_s32(q11s32, q1s32); - - d16s16 = vqrshrn_n_s32(q3s32, 14); - d17s16 = vqrshrn_n_s32(q12s32, 14); - d18s16 = vqrshrn_n_s32(q13s32, 14); - d19s16 = vqrshrn_n_s32(q1s32, 14); - q8s16 = vcombine_s16(d16s16, d17s16); - q9s16 = vcombine_s16(d18s16, d19s16); - - q0s32 = vmull_s16(d20s16, d31s16); - q1s32 = vmull_s16(d21s16, d31s16); 
- q12s32 = vmull_s16(d20s16, d30s16); - q13s32 = vmull_s16(d21s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d30s16); - q1s32 = vmlal_s16(q1s32, d29s16, d30s16); - q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); - - d22s16 = vqrshrn_n_s32(q0s32, 14); - d23s16 = vqrshrn_n_s32(q1s32, 14); - d20s16 = vqrshrn_n_s32(q12s32, 14); - d21s16 = vqrshrn_n_s32(q13s32, 14); - q10s16 = vcombine_s16(d20s16, d21s16); - q11s16 = vcombine_s16(d22s16, d23s16); - - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q15s16 = vaddq_s16(q6s16, q7s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - // stage 5 - q0s16 = vaddq_s16(q8s16, q11s16); - q1s16 = vaddq_s16(q9s16, q10s16); - q2s16 = vsubq_s16(q9s16, q10s16); - q3s16 = vsubq_s16(q8s16, q11s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q11s32 = vmull_s16(d26s16, d16s16); - q12s32 = vmull_s16(d27s16, d16s16); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - - q6s32 = vsubq_s32(q9s32, q11s32); - q13s32 = vsubq_s32(q10s32, q12s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d10s16 = vqrshrn_n_s32(q6s32, 14); - d11s16 = vqrshrn_n_s32(q13s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q8s16 = vaddq_s16(q0s16, q15s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); +void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, + int output_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = 
vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d0s16 = vdup_n_s16(cospi_28_64); + d1s16 = vdup_n_s16(cospi_4_64); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d18s16, d1s16); + q6s32 = vmull_s16(d19s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlal_s16(q5s32, d30s16, d0s16); + q6s32 = vmlal_s16(q6s32, d31s16, d0s16); + + d2s16 = vdup_n_s16(cospi_12_64); + d3s16 = vdup_n_s16(cospi_20_64); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q5s32, 14); + d15s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + q2s32 = vmull_s16(d26s16, d2s16); + q3s32 = vmull_s16(d27s16, d2s16); + q9s32 = vmull_s16(d26s16, d3s16); + q15s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q15s32 = vmlal_s16(q15s32, d23s16, d2s16); + + d10s16 = vqrshrn_n_s32(q2s32, 14); + d11s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q15s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + d30s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d30s16); + q11s32 = vmull_s16(d17s16, d30s16); + q0s32 = vmull_s16(d24s16, d30s16); + q1s32 = vmull_s16(d25s16, d30s16); + + d30s16 = vdup_n_s16(cospi_24_64); + d31s16 = vdup_n_s16(cospi_8_64); + + q3s32 = vaddq_s32(q2s32, q0s32); + q12s32 = vaddq_s32(q11s32, q1s32); + q13s32 = vsubq_s32(q2s32, q0s32); + q1s32 = vsubq_s32(q11s32, q1s32); + + d16s16 = vqrshrn_n_s32(q3s32, 14); + d17s16 = vqrshrn_n_s32(q12s32, 14); + d18s16 = vqrshrn_n_s32(q13s32, 14); + d19s16 = vqrshrn_n_s32(q1s32, 14); + q8s16 = vcombine_s16(d16s16, d17s16); + q9s16 = vcombine_s16(d18s16, d19s16); + + q0s32 = vmull_s16(d20s16, d31s16); + q1s32 = vmull_s16(d21s16, d31s16); + q12s32 = vmull_s16(d20s16, d30s16); + q13s32 = vmull_s16(d21s16, d30s16); + + q0s32 = vmlal_s16(q0s32, d28s16, d30s16); + q1s32 = vmlal_s16(q1s32, d29s16, d30s16); + q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); + + d22s16 = vqrshrn_n_s32(q0s32, 14); + d23s16 = vqrshrn_n_s32(q1s32, 14); + d20s16 = vqrshrn_n_s32(q12s32, 14); + d21s16 = vqrshrn_n_s32(q13s32, 14); + q10s16 = vcombine_s16(d20s16, d21s16); + q11s16 = vcombine_s16(d22s16, d23s16); + + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q15s16 = vaddq_s16(q6s16, q7s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + // stage 5 + q0s16 = vaddq_s16(q8s16, q11s16); + q1s16 = vaddq_s16(q9s16, q10s16); + q2s16 = vsubq_s16(q9s16, q10s16); + q3s16 = vsubq_s16(q8s16, q11s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q11s32 = vmull_s16(d26s16, d16s16); + q12s32 = vmull_s16(d27s16, d16s16); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + + q6s32 = vsubq_s32(q9s32, q11s32); + q13s32 = vsubq_s32(q10s32, q12s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d10s16 = vqrshrn_n_s32(q6s32, 14); + d11s16 = vqrshrn_n_s32(q13s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = 
vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q8s16 = vaddq_s16(q0s16, q15s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d16u64); + out += output_stride; + vst1_u64((uint64_t *)out, d17u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride) { + uint8_t *d; + uint8x8_t d12u8, d13u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d24u64, d25u64, d26u64, d27u64; + int64x1_t d12s64, d13s64; + uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; + uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 
16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d12s16 = vdup_n_s16(cospi_30_64); + d13s16 = vdup_n_s16(cospi_2_64); + + q2s32 = vmull_s16(d16s16, d12s16); + q3s32 = vmull_s16(d17s16, d12s16); + q1s32 = vmull_s16(d16s16, d13s16); + q4s32 = vmull_s16(d17s16, d13s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); + q1s32 = vmlal_s16(q1s32, d30s16, d12s16); + q4s32 = vmlal_s16(q4s32, d31s16, d12s16); + + d0s16 = vqrshrn_n_s32(q2s32, 14); + d1s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q1s32, 14); + d15s16 = vqrshrn_n_s32(q4s32, 14); + q0s16 = vcombine_s16(d0s16, d1s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d30s16 = vdup_n_s16(cospi_14_64); + d31s16 = vdup_n_s16(cospi_18_64); + + q2s32 = vmull_s16(d24s16, d30s16); + q3s32 = vmull_s16(d25s16, d30s16); + q4s32 = vmull_s16(d24s16, d31s16); + q5s32 = vmull_s16(d25s16, d31s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); + q4s32 = vmlal_s16(q4s32, d22s16, d30s16); + q5s32 = vmlal_s16(q5s32, d23s16, d30s16); + + d2s16 = vqrshrn_n_s32(q2s32, 14); + d3s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q4s32, 14); + d13s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(cospi_22_64); + d31s16 = vdup_n_s16(cospi_10_64); + + q11s32 = vmull_s16(d20s16, d30s16); + q12s32 = vmull_s16(d21s16, d30s16); + q4s32 = vmull_s16(d20s16, d31s16); + q5s32 = vmull_s16(d21s16, d31s16); + + q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); + q4s32 = vmlal_s16(q4s32, d26s16, d30s16); + q5s32 = vmlal_s16(q5s32, d27s16, d30s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d11s16 = vqrshrn_n_s32(q5s32, 14); + d10s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + d30s16 = vdup_n_s16(cospi_6_64); + d31s16 = vdup_n_s16(cospi_26_64); + + q10s32 = vmull_s16(d28s16, d30s16); + q11s32 = vmull_s16(d29s16, d30s16); + q12s32 = vmull_s16(d28s16, d31s16); + q13s32 = vmull_s16(d29s16, d31s16); + + q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); + q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); + q12s32 = vmlal_s16(q12s32, d18s16, d30s16); + q13s32 = vmlal_s16(q13s32, d19s16, d30s16); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q11s32, 14); + d8s16 = vqrshrn_n_s32(q12s32, 14); + d9s16 = vqrshrn_n_s32(q13s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 3 + q9s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q10s16 = vsubq_s16(q3s16, q2s16); + 
q11s16 = vaddq_s16(q2s16, q3s16); + q12s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q6s16, q7s16); + + // stage 4 + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + q2s32 = vmull_s16(d18s16, d31s16); + q3s32 = vmull_s16(d19s16, d31s16); + q4s32 = vmull_s16(d28s16, d31s16); + q5s32 = vmull_s16(d29s16, d31s16); + + q2s32 = vmlal_s16(q2s32, d28s16, d30s16); + q3s32 = vmlal_s16(q3s32, d29s16, d30s16); + q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); + + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q3s32, 14); + d2s16 = vqrshrn_n_s32(q4s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + q3s16 = q11s16; + q4s16 = q12s16; + + d30s16 = vdup_n_s16(-cospi_8_64); + q11s32 = vmull_s16(d26s16, d30s16); + q12s32 = vmull_s16(d27s16, d30s16); + q8s32 = vmull_s16(d20s16, d30s16); + q9s32 = vmull_s16(d21s16, d30s16); + + q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); + q8s32 = vmlal_s16(q8s32, d26s16, d31s16); + q9s32 = vmlal_s16(q9s32, d27s16, d31s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16(cospi_16_64); + + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q10s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q10s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + if (skip_adding != 0) { + d = dest; + // load the data in pass1 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + 
pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q14s16 = vsubq_s16(q1s16, q14s16); q15s16 = vsubq_s16(q0s16, q15s16); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d16u64); - out += output_stride; - vst1_u64((uint64_t *)out, d17u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q4s16 = vsubq_s16(q11s16, q4s16); + 
q5s16 = vsubq_s16(q10s16, q5s16); -void vpx_idct16x16_256_add_neon_pass2( - int16_t *src, - int16_t *out, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride) { - uint8_t *d; - uint8x8_t d12u8, d13u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d24u64, d25u64, d26u64, d27u64; - int64x1_t d12s64, d13s64; - uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; - uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d12s16 = vdup_n_s16(cospi_30_64); - d13s16 = vdup_n_s16(cospi_2_64); - - q2s32 = vmull_s16(d16s16, d12s16); - q3s32 = vmull_s16(d17s16, d12s16); - q1s32 = vmull_s16(d16s16, d13s16); - q4s32 = vmull_s16(d17s16, d13s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); - q1s32 = vmlal_s16(q1s32, d30s16, d12s16); - q4s32 = vmlal_s16(q4s32, d31s16, d12s16); - - d0s16 = vqrshrn_n_s32(q2s32, 14); - d1s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q1s32, 14); - d15s16 = vqrshrn_n_s32(q4s32, 14); - q0s16 = vcombine_s16(d0s16, d1s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d30s16 = vdup_n_s16(cospi_14_64); - d31s16 = vdup_n_s16(cospi_18_64); - - q2s32 = vmull_s16(d24s16, d30s16); - q3s32 = vmull_s16(d25s16, d30s16); - q4s32 = vmull_s16(d24s16, d31s16); - q5s32 = vmull_s16(d25s16, d31s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); - q4s32 = vmlal_s16(q4s32, d22s16, d30s16); - q5s32 = vmlal_s16(q5s32, d23s16, d30s16); - - d2s16 = vqrshrn_n_s32(q2s32, 14); - d3s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q4s32, 14); - d13s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(cospi_22_64); - d31s16 = vdup_n_s16(cospi_10_64); - - q11s32 = vmull_s16(d20s16, d30s16); - q12s32 = vmull_s16(d21s16, 
d30s16); - q4s32 = vmull_s16(d20s16, d31s16); - q5s32 = vmull_s16(d21s16, d31s16); - - q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); - q4s32 = vmlal_s16(q4s32, d26s16, d30s16); - q5s32 = vmlal_s16(q5s32, d27s16, d30s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d11s16 = vqrshrn_n_s32(q5s32, 14); - d10s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - d30s16 = vdup_n_s16(cospi_6_64); - d31s16 = vdup_n_s16(cospi_26_64); - - q10s32 = vmull_s16(d28s16, d30s16); - q11s32 = vmull_s16(d29s16, d30s16); - q12s32 = vmull_s16(d28s16, d31s16); - q13s32 = vmull_s16(d29s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); - q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); - q12s32 = vmlal_s16(q12s32, d18s16, d30s16); - q13s32 = vmlal_s16(q13s32, d19s16, d30s16); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q11s32, 14); - d8s16 = vqrshrn_n_s32(q12s32, 14); - d9s16 = vqrshrn_n_s32(q13s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 3 - q9s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q10s16 = vsubq_s16(q3s16, q2s16); - q11s16 = vaddq_s16(q2s16, q3s16); - q12s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q6s16, q7s16); - - // stage 4 - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q2s32 = vmull_s16(d18s16, d31s16); - q3s32 = vmull_s16(d19s16, d31s16); - q4s32 = vmull_s16(d28s16, d31s16); - q5s32 = vmull_s16(d29s16, d31s16); - - q2s32 = vmlal_s16(q2s32, d28s16, d30s16); - q3s32 = vmlal_s16(q3s32, d29s16, d30s16); - q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); - - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q3s32, 14); - d2s16 = vqrshrn_n_s32(q4s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q3s16 = q11s16; - q4s16 = q12s16; - - d30s16 = vdup_n_s16(-cospi_8_64); - q11s32 = vmull_s16(d26s16, d30s16); - q12s32 = vmull_s16(d27s16, d30s16); - q8s32 = vmull_s16(d20s16, d30s16); - q9s32 = vmull_s16(d21s16, d30s16); - - q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); - q8s32 = vmlal_s16(q8s32, d26s16, d31s16); - q9s32 = vmlal_s16(q9s32, d27s16, d31s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = 
vget_high_s16(q13s16); - - d14s16 = vdup_n_s16(cospi_16_64); - - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q10s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q10s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - if (skip_adding != 0) { - d = dest; - // load the data in pass1 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = 
vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), - vreinterpret_u8_s64(d12s64)); - q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), - vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - // store the data out 8,9,10,11,12,13,14,15 - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q8s16 = vrshrq_n_s16(q8s16, 6); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q9s16 = vrshrq_n_s16(q9s16, 6); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q2s16 = vrshrq_n_s16(q2s16, 6); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q3s16 = vrshrq_n_s16(q3s16, 6); - q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q4s16 = vrshrq_n_s16(q4s16, 6); - q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q5s16 = vrshrq_n_s16(q5s16, 6); - q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q14s16 = vrshrq_n_s16(q14s16, 6); - q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - q15s16 = vrshrq_n_s16(q15s16, 6); - q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16), - vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); - vst1_u64((uint64_t *)d, 
vreinterpret_u64_u8(d12u8)); - } else { // skip_adding_dest - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); - out += 4; - vst1_u64((uint64_t 
*)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); - } - return; -} + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); -void vpx_idct16x16_10_add_neon_pass1( - int16_t *in, - int16_t *out, - int output_stride) { - int16x4_t d4s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // stage 3 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - // stage 4 - q1s16 = vdupq_n_s16(cospi_16_64 * 2); - d4s16 = vdup_n_s16(cospi_16_64); - - q8s16 = vqrdmulhq_s16(q8s16, q1s16); - - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - q9s32 = vmull_s16(d14s16, d4s16); - q10s32 = vmull_s16(d15s16, d4s16); - q12s32 = vmull_s16(d9s16, d4s16); - q11s32 = vmull_s16(d8s16, d4s16); - - q15s32 = vsubq_s32(q10s32, q12s32); - q6s32 = vsubq_s32(q9s32, q11s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d11s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q6s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = 
vcombine_s16(d12s16, d13s16); - - // stage 6 - q2s16 = vaddq_s16(q8s16, q7s16); - q9s16 = vaddq_s16(q8s16, q6s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q8s16, q4s16); - q12s16 = vsubq_s16(q8s16, q4s16); - q13s16 = vsubq_s16(q8s16, q5s16); - q14s16 = vsubq_s16(q8s16, q6s16); - q15s16 = vsubq_s16(q8s16, q7s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d4u64); - out += output_stride; - vst1_u64((uint64_t *)out, d5u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); -void vpx_idct16x16_10_add_neon_pass2( - int16_t *src, - int16_t *out, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; - uint64x1_t d4u64, d5u64, d6u64, 
d7u64, d8u64, d9u64, d10u64, d11u64; - uint64x1_t d16u64, d17u64, d18u64, d19u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - (void)skip_adding; - (void)dest; - (void)dest_stride; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // stage 3 - q6s16 = vdupq_n_s16(cospi_30_64 * 2); - q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16(cospi_2_64 * 2); - q7s16 = vqrdmulhq_s16(q8s16, q6s16); - - q15s16 = vdupq_n_s16(-cospi_26_64 * 2); - q14s16 = vdupq_n_s16(cospi_6_64 * 2); - q3s16 = vqrdmulhq_s16(q9s16, q15s16); - q4s16 = vqrdmulhq_s16(q9s16, q14s16); - - // stage 4 - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - d6s16 = vget_low_s16(q3s16); - d7s16 = vget_high_s16(q3s16); - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - q12s32 = vmull_s16(d14s16, d31s16); - q5s32 = vmull_s16(d15s16, d31s16); - q2s32 = vmull_s16(d0s16, d31s16); - q11s32 = vmull_s16(d1s16, d31s16); - - q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); - q2s32 = vmlal_s16(q2s32, d14s16, d30s16); - q11s32 = vmlal_s16(q11s32, d15s16, d30s16); - - d2s16 = vqrshrn_n_s32(q12s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q11s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(-cospi_8_64); - q10s32 = vmull_s16(d8s16, d30s16); - q13s32 = vmull_s16(d9s16, d30s16); - q8s32 = vmull_s16(d6s16, d30s16); - q9s32 = vmull_s16(d7s16, d30s16); - - q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); - q8s32 = vmlal_s16(q8s32, d8s16, d31s16); - q9s32 = vmlal_s16(q9s32, d9s16, d31s16); - - d4s16 = vqrshrn_n_s32(q10s32, 14); - d5s16 = vqrshrn_n_s32(q13s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = 
vdup_n_s16(cospi_16_64); - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q0s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q0s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 + // store the data out 8,9,10,11,12,13,14,15 + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q8s16 = vrshrq_n_s16(q8s16, 6); + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q9s16 = vrshrq_n_s16(q9s16, 6); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q2s16 = vrshrq_n_s16(q2s16, 6); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q3s16 = vrshrq_n_s16(q3s16, 6); + q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q4s16 = vrshrq_n_s16(q4s16, 6); + q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q5s16 = vrshrq_n_s16(q5s16, 6); + q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q14s16 = vrshrq_n_s16(q14s16, 6); + q14u16 = + vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + q15s16 = vrshrq_n_s16(q15s16, 6); + q15u16 = + vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + } else { // skip_adding_dest 
q0s16 = vld1q_s16(pass1Output); pass1Output += 8; q1s16 = vld1q_s16(pass1Output); @@ -1248,6 +809,7 @@ void vpx_idct16x16_10_add_neon_pass2( q10s16 = vld1q_s16(pass1Output); pass1Output += 8; q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; q12s16 = vaddq_s16(q10s16, q9s16); q13s16 = vaddq_s16(q11s16, q8s16); d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); @@ -1265,53 +827,468 @@ void vpx_idct16x16_10_add_neon_pass2( q8s16 = vsubq_s16(q11s16, q8s16); q9s16 = vsubq_s16(q10s16, q9s16); - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); - d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); - d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); - d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); - d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); - d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - vst1_u64((uint64_t *)out, d16u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); out += 4; - vst1_u64((uint64_t *)out, d17u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); out += 12; - vst1_u64((uint64_t *)out, d18u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); out += 4; - vst1_u64((uint64_t *)out, d19u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); out += 12; - vst1_u64((uint64_t *)out, d4u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); out += 4; - vst1_u64((uint64_t *)out, d5u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); out += 12; - vst1_u64((uint64_t *)out, d6u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); out += 4; - vst1_u64((uint64_t *)out, d7u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); out += 12; - vst1_u64((uint64_t *)out, d8u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); out += 4; - vst1_u64((uint64_t *)out, d9u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); out += 12; - vst1_u64((uint64_t *)out, d10u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); out += 4; - vst1_u64((uint64_t *)out, d11u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); out += 12; - vst1_u64((uint64_t *)out, d28u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); out += 4; - vst1_u64((uint64_t *)out, d29u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); out += 12; - vst1_u64((uint64_t *)out, d30u64); + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); out += 4; - vst1_u64((uint64_t *)out, d31u64); - return; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + } + return; +} + +void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, + int output_stride) { + int16x4_t d4s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, 
d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // stage 3 + q0s16 = vdupq_n_s16(cospi_28_64 * 2); + q1s16 = vdupq_n_s16(cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + // stage 4 + q1s16 = vdupq_n_s16(cospi_16_64 * 2); + d4s16 = vdup_n_s16(cospi_16_64); + + q8s16 = vqrdmulhq_s16(q8s16, q1s16); + + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + q9s32 = vmull_s16(d14s16, d4s16); + q10s32 = vmull_s16(d15s16, d4s16); + q12s32 = vmull_s16(d9s16, d4s16); + q11s32 = vmull_s16(d8s16, d4s16); + + q15s32 = vsubq_s32(q10s32, q12s32); + q6s32 = vsubq_s32(q9s32, q11s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d11s16 = vqrshrn_n_s32(q15s32, 14); + d10s16 = vqrshrn_n_s32(q6s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q2s16 = vaddq_s16(q8s16, q7s16); + q9s16 = vaddq_s16(q8s16, q6s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q8s16, q4s16); + q12s16 = vsubq_s16(q8s16, q4s16); + q13s16 = vsubq_s16(q8s16, q5s16); + q14s16 = vsubq_s16(q8s16, q6s16); + q15s16 = vsubq_s16(q8s16, q7s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d4u64); + out += output_stride; + vst1_u64((uint64_t *)out, d5u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out 
+= output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; + uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; + uint64x1_t d16u64, d17u64, d18u64, d19u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + (void)skip_adding; + (void)dest; + (void)dest_stride; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // stage 3 + q6s16 = vdupq_n_s16(cospi_30_64 * 2); + q0s16 = vqrdmulhq_s16(q8s16, q6s16); + q6s16 = vdupq_n_s16(cospi_2_64 * 2); + q7s16 = vqrdmulhq_s16(q8s16, q6s16); + + q15s16 = vdupq_n_s16(-cospi_26_64 * 2); + q14s16 = vdupq_n_s16(cospi_6_64 * 2); + q3s16 = vqrdmulhq_s16(q9s16, q15s16); + q4s16 = vqrdmulhq_s16(q9s16, q14s16); + + // stage 4 + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + d6s16 = vget_low_s16(q3s16); + d7s16 = vget_high_s16(q3s16); + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + + d30s16 = vdup_n_s16(cospi_8_64); + d31s16 = vdup_n_s16(cospi_24_64); + + q12s32 = vmull_s16(d14s16, d31s16); + q5s32 = vmull_s16(d15s16, d31s16); + q2s32 = vmull_s16(d0s16, d31s16); + q11s32 = vmull_s16(d1s16, d31s16); + + q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); + q2s32 = vmlal_s16(q2s32, d14s16, d30s16); + q11s32 = vmlal_s16(q11s32, d15s16, d30s16); + + d2s16 = vqrshrn_n_s32(q12s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q11s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(-cospi_8_64); + q10s32 = vmull_s16(d8s16, d30s16); + q13s32 = vmull_s16(d9s16, d30s16); + q8s32 = vmull_s16(d6s16, d30s16); + q9s32 = vmull_s16(d7s16, d30s16); + + q10s32 = 
vmlsl_s16(q10s32, d6s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); + q8s32 = vmlal_s16(q8s32, d8s16, d31s16); + q9s32 = vmlal_s16(q9s32, d9s16, d31s16); + + d4s16 = vqrshrn_n_s32(q10s32, 14); + d5s16 = vqrshrn_n_s32(q13s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16(cospi_16_64); + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q0s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q0s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = 
vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); + d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); + d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); + d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); + d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); + d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + vst1_u64((uint64_t *)out, d16u64); + out += 4; + vst1_u64((uint64_t *)out, d17u64); + out += 12; + vst1_u64((uint64_t *)out, d18u64); + out += 4; + vst1_u64((uint64_t *)out, d19u64); + out += 12; + vst1_u64((uint64_t *)out, d4u64); + out += 4; + vst1_u64((uint64_t *)out, d5u64); + out += 12; + vst1_u64((uint64_t *)out, d6u64); + out += 4; + vst1_u64((uint64_t *)out, d7u64); + out += 12; + vst1_u64((uint64_t *)out, d8u64); + out += 4; + vst1_u64((uint64_t *)out, d9u64); + out += 12; + vst1_u64((uint64_t *)out, d10u64); + out += 4; + vst1_u64((uint64_t *)out, d11u64); + out += 12; + vst1_u64((uint64_t *)out, d28u64); + out += 4; + vst1_u64((uint64_t *)out, d29u64); + out += 12; + vst1_u64((uint64_t *)out, d30u64); + out += 4; + vst1_u64((uint64_t *)out, d31u64); + return; } diff --git a/vpx_dsp/arm/idct16x16_neon.c b/vpx_dsp/arm/idct16x16_neon.c index 352979aa16f7613c4a4d6c176095921be08a61a5..ecc263df28445be1e3d94b5d85126dd6c82062a5 100644 --- a/vpx_dsp/arm/idct16x16_neon.c +++ b/vpx_dsp/arm/idct16x16_neon.c @@ -10,24 +10,16 @@ #include "vpx_dsp/vpx_dsp_common.h" -void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, - int16_t *output, +void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output, int output_stride); -void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); -void vpx_idct16x16_10_add_neon_pass1(const int16_t 
*input, - int16_t *output, +void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output, int output_stride); -void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); +void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride); #if HAVE_NEON_ASM /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ @@ -35,13 +27,13 @@ extern void vpx_push_neon(int64_t *store); extern void vpx_pop_neon(int64_t *store); #endif // HAVE_NEON_ASM -void vpx_idct16x16_256_add_neon(const int16_t *input, - uint8_t *dest, int dest_stride) { +void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, + int dest_stride) { #if HAVE_NEON_ASM int64_t store_reg[8]; #endif - int16_t pass1_output[16*16] = {0}; - int16_t row_idct_output[16*16] = {0}; + int16_t pass1_output[16 * 16] = { 0 }; + int16_t row_idct_output[16 * 16] = { 0 }; #if HAVE_NEON_ASM // save d8-d15 register values. @@ -56,27 +48,19 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input+1, - row_idct_output, - pass1_output, - 0, - dest, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, + dest, dest_stride); /* Parallel idct on the lower 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input+8*16+1, - row_idct_output+8, - pass1_output, - 0, - dest, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8, + pass1_output, 0, dest, dest_stride); /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the @@ -86,27 +70,20 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, - row_idct_output, - pass1_output, - 1, - dest, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, + pass1_output, 1, dest, dest_stride); /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. 
// Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, - row_idct_output+8, - pass1_output, - 1, - dest+8, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, + row_idct_output + 8, pass1_output, 1, + dest + 8, dest_stride); #if HAVE_NEON_ASM // restore d8-d15 register values. @@ -116,13 +93,13 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, return; } -void vpx_idct16x16_10_add_neon(const int16_t *input, - uint8_t *dest, int dest_stride) { +void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, + int dest_stride) { #if HAVE_NEON_ASM int64_t store_reg[8]; #endif - int16_t pass1_output[16*16] = {0}; - int16_t row_idct_output[16*16] = {0}; + int16_t pass1_output[16 * 16] = { 0 }; + int16_t row_idct_output[16 * 16] = { 0 }; #if HAVE_NEON_ASM // save d8-d15 register values. @@ -137,12 +114,8 @@ void vpx_idct16x16_10_add_neon(const int16_t *input, // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_10_add_neon_pass2(input+1, - row_idct_output, - pass1_output, - 0, - dest, - dest_stride); + vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, + dest, dest_stride); /* Skip Parallel idct on the lower 8 rows as they are all 0s */ @@ -154,27 +127,20 @@ void vpx_idct16x16_10_add_neon(const int16_t *input, // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+1, - row_idct_output, - pass1_output, - 1, - dest, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, + pass1_output, 1, dest, dest_stride); /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, - row_idct_output+8, - pass1_output, - 1, - dest+8, - dest_stride); + vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, + row_idct_output + 8, pass1_output, 1, + dest + 8, dest_stride); #if HAVE_NEON_ASM // restore d8-d15 register values. 
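The next two files touch the DC-only ("_1_add") kernels, where only the DC coefficient is nonzero, so both transform passes reduce to a constant multiply and the result is added to every destination pixel with saturation. A minimal scalar sketch of that path for the 32x32 case, assuming Q14 cosine constants and local helper names (round_shift, clip_pixel) that are illustrative rather than the library's own definitions:

#include <stdint.h>

/* Round-to-nearest right shift, as used after Q14 constant multiplies. */
static int16_t round_shift(int32_t x, int bits) {
  return (int16_t)((x + (1 << (bits - 1))) >> bits);
}

/* Clamp an intermediate sum to the 8-bit pixel range. */
static uint8_t clip_pixel(int v) {
  return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Scalar reference for the DC-only 32x32 inverse transform + add. */
static void idct32x32_1_add_ref(const int16_t *input, uint8_t *dest,
                                int stride) {
  const int16_t cospi_16_64 = 11585;                      /* cos(pi/4) in Q14 */
  int16_t out = round_shift(input[0] * cospi_16_64, 14);  /* row pass */
  int a1, r, c;
  out = round_shift(out * cospi_16_64, 14);               /* column pass */
  a1 = (out + 32) >> 6;                                   /* final rounding shift */

  for (r = 0; r < 32; ++r, dest += stride)
    for (c = 0; c < 32; ++c) dest[c] = clip_pixel(dest[c] + a1);
}

The 4x4 variant later in this patch is the same computation with a final shift of 4 instead of 6. The NEON 32x32 kernel below clamps a1 (or -a1) to [0, 255] up front so the per-pixel update can use vqaddq_u8/vqsubq_u8 on 16 pixels at a time, while the 4x4 kernel instead widens with vaddw_u8 and narrows back with vqmovun_s16.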
diff --git a/vpx_dsp/arm/idct32x32_1_add_neon.c b/vpx_dsp/arm/idct32x32_1_add_neon.c index c25c0c4a5c208b4e758793e71fa6bdd40633e078..dab7d098e8abd82f30c1b4b6d43d69f7e3826f98 100644 --- a/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -15,151 +15,126 @@ #include "vpx_dsp/inv_txfm.h" #include "vpx_ports/mem.h" -static INLINE void LD_16x8( - uint8_t *d, - int d_stride, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vld1q_u8(d); - d += d_stride; - *q9u8 = vld1q_u8(d); - d += d_stride; - *q10u8 = vld1q_u8(d); - d += d_stride; - *q11u8 = vld1q_u8(d); - d += d_stride; - *q12u8 = vld1q_u8(d); - d += d_stride; - *q13u8 = vld1q_u8(d); - d += d_stride; - *q14u8 = vld1q_u8(d); - d += d_stride; - *q15u8 = vld1q_u8(d); - return; +static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vld1q_u8(d); + d += d_stride; + *q9u8 = vld1q_u8(d); + d += d_stride; + *q10u8 = vld1q_u8(d); + d += d_stride; + *q11u8 = vld1q_u8(d); + d += d_stride; + *q12u8 = vld1q_u8(d); + d += d_stride; + *q13u8 = vld1q_u8(d); + d += d_stride; + *q14u8 = vld1q_u8(d); + d += d_stride; + *q15u8 = vld1q_u8(d); + return; } -static INLINE void ADD_DIFF_16x8( - uint8x16_t qdiffu8, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqaddq_u8(*q8u8, qdiffu8); - *q9u8 = vqaddq_u8(*q9u8, qdiffu8); - *q10u8 = vqaddq_u8(*q10u8, qdiffu8); - *q11u8 = vqaddq_u8(*q11u8, qdiffu8); - *q12u8 = vqaddq_u8(*q12u8, qdiffu8); - *q13u8 = vqaddq_u8(*q13u8, qdiffu8); - *q14u8 = vqaddq_u8(*q14u8, qdiffu8); - *q15u8 = vqaddq_u8(*q15u8, qdiffu8); - return; +static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqaddq_u8(*q8u8, qdiffu8); + *q9u8 = vqaddq_u8(*q9u8, qdiffu8); + *q10u8 = vqaddq_u8(*q10u8, qdiffu8); + *q11u8 = vqaddq_u8(*q11u8, qdiffu8); + *q12u8 = vqaddq_u8(*q12u8, qdiffu8); + *q13u8 = vqaddq_u8(*q13u8, qdiffu8); + *q14u8 = vqaddq_u8(*q14u8, qdiffu8); + *q15u8 = vqaddq_u8(*q15u8, qdiffu8); + return; } -static INLINE void SUB_DIFF_16x8( - uint8x16_t qdiffu8, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqsubq_u8(*q8u8, qdiffu8); - *q9u8 = vqsubq_u8(*q9u8, qdiffu8); - *q10u8 = vqsubq_u8(*q10u8, qdiffu8); - *q11u8 = vqsubq_u8(*q11u8, qdiffu8); - *q12u8 = vqsubq_u8(*q12u8, qdiffu8); - *q13u8 = vqsubq_u8(*q13u8, qdiffu8); - *q14u8 = vqsubq_u8(*q14u8, qdiffu8); - *q15u8 = vqsubq_u8(*q15u8, qdiffu8); - return; +static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqsubq_u8(*q8u8, qdiffu8); + *q9u8 = vqsubq_u8(*q9u8, qdiffu8); + *q10u8 = vqsubq_u8(*q10u8, qdiffu8); + *q11u8 = vqsubq_u8(*q11u8, qdiffu8); + *q12u8 = vqsubq_u8(*q12u8, qdiffu8); + *q13u8 = vqsubq_u8(*q13u8, qdiffu8); + *q14u8 = vqsubq_u8(*q14u8, qdiffu8); + *q15u8 = vqsubq_u8(*q15u8, 
qdiffu8); + return; } -static INLINE void ST_16x8( - uint8_t *d, - int d_stride, - uint8x16_t *q8u8, - uint8x16_t *q9u8, - uint8x16_t *q10u8, - uint8x16_t *q11u8, - uint8x16_t *q12u8, - uint8x16_t *q13u8, - uint8x16_t *q14u8, - uint8x16_t *q15u8) { - vst1q_u8(d, *q8u8); - d += d_stride; - vst1q_u8(d, *q9u8); - d += d_stride; - vst1q_u8(d, *q10u8); - d += d_stride; - vst1q_u8(d, *q11u8); - d += d_stride; - vst1q_u8(d, *q12u8); - d += d_stride; - vst1q_u8(d, *q13u8); - d += d_stride; - vst1q_u8(d, *q14u8); - d += d_stride; - vst1q_u8(d, *q15u8); - return; +static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + vst1q_u8(d, *q8u8); + d += d_stride; + vst1q_u8(d, *q9u8); + d += d_stride; + vst1q_u8(d, *q10u8); + d += d_stride; + vst1q_u8(d, *q11u8); + d += d_stride; + vst1q_u8(d, *q12u8); + d += d_stride; + vst1q_u8(d, *q13u8); + d += d_stride; + vst1q_u8(d, *q14u8); + d += d_stride; + vst1q_u8(d, *q15u8); + return; } -void vpx_idct32x32_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int i, j, dest_stride8; - uint8_t *d; - int16_t a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); +void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int i, j, dest_stride8; + uint8_t *d; + int16_t a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); - dest_stride8 = dest_stride * 8; - if (a1 >= 0) { // diff_positive_32_32 - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - d += dest_stride8; - } - } - } else { // diff_negative_32_32 - a1 = -a1; - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, - &q12u8, &q13u8, &q14u8, &q15u8); - d += dest_stride8; - } - } + dest_stride8 = dest_stride * 8; + if (a1 >= 0) { // diff_positive_32_32 + a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + d += dest_stride8; + } } - return; + } else { // diff_negative_32_32 + a1 = -a1; + a1 = a1 < 0 ? 0 : a1 > 255 ? 
255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + d += dest_stride8; + } + } + } + return; } diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c index 025437eb963bc044a116e46f05be4d648a36064c..88b3d0109afdadf5f725b955ae0883cc673988f8 100644 --- a/vpx_dsp/arm/idct32x32_add_neon.c +++ b/vpx_dsp/arm/idct32x32_add_neon.c @@ -14,706 +14,672 @@ #include "vpx_dsp/txfm_common.h" #define LOAD_FROM_TRANSPOSED(prev, first, second) \ - q14s16 = vld1q_s16(trans_buf + first * 8); \ - q13s16 = vld1q_s16(trans_buf + second * 8); + q14s16 = vld1q_s16(trans_buf + first * 8); \ + q13s16 = vld1q_s16(trans_buf + second * 8); #define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ - qA = vld1q_s16(out + first * 32); \ - qB = vld1q_s16(out + second * 32); + qA = vld1q_s16(out + first * 32); \ + qB = vld1q_s16(out + second * 32); #define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ - vst1q_s16(out + first * 32, qA); \ - vst1q_s16(out + second * 32, qB); - -#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ - __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \ - q6s16, q7s16, q8s16, q9s16); -static INLINE void __STORE_COMBINE_CENTER_RESULTS( - uint8_t *p1, - uint8_t *p2, - int stride, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16) { - int16x4_t d8s16, d9s16, d10s16, d11s16; - - d8s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d11s16 = vld1_s16((int16_t *)p2); - p2 -= stride; - d9s16 = vld1_s16((int16_t *)p1); - d10s16 = vld1_s16((int16_t *)p2); - - q7s16 = vrshrq_n_s16(q7s16, 6); - q8s16 = vrshrq_n_s16(q8s16, 6); - q9s16 = vrshrq_n_s16(q9s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - - q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), - vreinterpret_u8_s16(d9s16))); - q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_s16(d10s16))); - q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_s16(d11s16))); - q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), - vreinterpret_u8_s16(d8s16))); - - d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); - d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); - d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - - vst1_s16((int16_t *)p1, d9s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d10s16); - p2 += stride; - vst1_s16((int16_t *)p1, d8s16); - vst1_s16((int16_t *)p2, d11s16); - return; + vst1q_s16(out + first * 32, qA); \ + vst1q_s16(out + second * 32, qB); + +#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ + __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16); +static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2, + int stride, int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16) { + int16x4_t d8s16, d9s16, d10s16, d11s16; + + d8s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d11s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d9s16 = vld1_s16((int16_t *)p1); + d10s16 = vld1_s16((int16_t *)p2); + + q7s16 = vrshrq_n_s16(q7s16, 6); + q8s16 = vrshrq_n_s16(q8s16, 6); + q9s16 = vrshrq_n_s16(q9s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + + q7s16 = 
vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16))); + q8s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16))); + q9s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16))); + q6s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16))); + + d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); + d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); + d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + + vst1_s16((int16_t *)p1, d9s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d10s16); + p2 += stride; + vst1_s16((int16_t *)p1, d8s16); + vst1_s16((int16_t *)p2, d11s16); + return; } -#define STORE_COMBINE_EXTREME_RESULTS(r7, r6); \ - __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \ - q4s16, q5s16, q6s16, q7s16); -static INLINE void __STORE_COMBINE_EXTREME_RESULTS( - uint8_t *p1, - uint8_t *p2, - int stride, - int16x8_t q4s16, - int16x8_t q5s16, - int16x8_t q6s16, - int16x8_t q7s16) { - int16x4_t d4s16, d5s16, d6s16, d7s16; - - d4s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d7s16 = vld1_s16((int16_t *)p2); - p2 -= stride; - d5s16 = vld1_s16((int16_t *)p1); - d6s16 = vld1_s16((int16_t *)p2); - - q5s16 = vrshrq_n_s16(q5s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - q7s16 = vrshrq_n_s16(q7s16, 6); - q4s16 = vrshrq_n_s16(q4s16, 6); - - q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16), - vreinterpret_u8_s16(d5s16))); - q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), - vreinterpret_u8_s16(d6s16))); - q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), - vreinterpret_u8_s16(d7s16))); - q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16), - vreinterpret_u8_s16(d4s16))); - - d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); - d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); - - vst1_s16((int16_t *)p1, d5s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d6s16); - p2 += stride; - vst1_s16((int16_t *)p2, d7s16); - vst1_s16((int16_t *)p1, d4s16); - return; +#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \ + ; \ + __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16); +static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2, + int stride, int16x8_t q4s16, + int16x8_t q5s16, + int16x8_t q6s16, + int16x8_t q7s16) { + int16x4_t d4s16, d5s16, d6s16, d7s16; + + d4s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d7s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d5s16 = vld1_s16((int16_t *)p1); + d6s16 = vld1_s16((int16_t *)p2); + + q5s16 = vrshrq_n_s16(q5s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + q7s16 = vrshrq_n_s16(q7s16, 6); + q4s16 = vrshrq_n_s16(q4s16, 6); + + q5s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16))); + q6s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16))); + q7s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16))); + q4s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16))); + + d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); + d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); + + 
vst1_s16((int16_t *)p1, d5s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d6s16); + p2 += stride; + vst1_s16((int16_t *)p2, d7s16); + vst1_s16((int16_t *)p1, d4s16); + return; } #define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ - DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); -static INLINE void DO_BUTTERFLY( - int16x8_t q14s16, - int16x8_t q13s16, - int16_t first_const, - int16_t second_const, - int16x8_t *qAs16, - int16x8_t *qBs16) { - int16x4_t d30s16, d31s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; - int16x4_t dCs16, dDs16, dAs16, dBs16; - - dCs16 = vget_low_s16(q14s16); - dDs16 = vget_high_s16(q14s16); - dAs16 = vget_low_s16(q13s16); - dBs16 = vget_high_s16(q13s16); - - d30s16 = vdup_n_s16(first_const); - d31s16 = vdup_n_s16(second_const); - - q8s32 = vmull_s16(dCs16, d30s16); - q10s32 = vmull_s16(dAs16, d31s16); - q9s32 = vmull_s16(dDs16, d30s16); - q11s32 = vmull_s16(dBs16, d31s16); - q12s32 = vmull_s16(dCs16, d31s16); - - q8s32 = vsubq_s32(q8s32, q10s32); - q9s32 = vsubq_s32(q9s32, q11s32); - - q10s32 = vmull_s16(dDs16, d31s16); - q11s32 = vmull_s16(dAs16, d30s16); - q15s32 = vmull_s16(dBs16, d30s16); - - q11s32 = vaddq_s32(q12s32, q11s32); - q10s32 = vaddq_s32(q10s32, q15s32); - - *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), - vqrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), - vqrshrn_n_s32(q10s32, 14)); - return; + DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); +static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, + int16_t first_const, int16_t second_const, + int16x8_t *qAs16, int16x8_t *qBs16) { + int16x4_t d30s16, d31s16; + int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; + int16x4_t dCs16, dDs16, dAs16, dBs16; + + dCs16 = vget_low_s16(q14s16); + dDs16 = vget_high_s16(q14s16); + dAs16 = vget_low_s16(q13s16); + dBs16 = vget_high_s16(q13s16); + + d30s16 = vdup_n_s16(first_const); + d31s16 = vdup_n_s16(second_const); + + q8s32 = vmull_s16(dCs16, d30s16); + q10s32 = vmull_s16(dAs16, d31s16); + q9s32 = vmull_s16(dDs16, d30s16); + q11s32 = vmull_s16(dBs16, d31s16); + q12s32 = vmull_s16(dCs16, d31s16); + + q8s32 = vsubq_s32(q8s32, q10s32); + q9s32 = vsubq_s32(q9s32, q11s32); + + q10s32 = vmull_s16(dDs16, d31s16); + q11s32 = vmull_s16(dAs16, d30s16); + q15s32 = vmull_s16(dBs16, d30s16); + + q11s32 = vaddq_s32(q12s32, q11s32); + q10s32 = vaddq_s32(q10s32, q15s32); + + *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14)); + *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14)); + return; } -static INLINE void idct32_transpose_pair( - int16_t *input, - int16_t *t_buf) { - int16_t *in; - int i; - const int stride = 32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - for (i = 0; i < 4; i++, input += 8) { - in = input; - q8s16 = vld1q_s16(in); - in += stride; - q9s16 = vld1q_s16(in); - in += stride; - q10s16 = vld1q_s16(in); - in += stride; - q11s16 = vld1q_s16(in); - in += stride; - q12s16 = vld1q_s16(in); - in += stride; - q13s16 = vld1q_s16(in); - in += stride; - q14s16 = vld1q_s16(in); - in += stride; - q15s16 = vld1q_s16(in); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = 
vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - q12s16 = vcombine_s16(d17s16, d25s16); - q13s16 = vcombine_s16(d19s16, d27s16); - q14s16 = vcombine_s16(d21s16, d29s16); - q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16), - vreinterpretq_s32_s16(q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16), - vreinterpretq_s32_s16(q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16), - vreinterpretq_s32_s16(q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - vst1q_s16(t_buf, q0x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q0x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[1]); - t_buf += 8; - } - return; +static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) { + int16_t *in; + int i; + const int stride = 32; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + for (i = 0; i < 4; i++, input += 8) { + in = input; + q8s16 = vld1q_s16(in); + in += stride; + q9s16 = vld1q_s16(in); + in += stride; + q10s16 = vld1q_s16(in); + in += stride; + q11s16 = vld1q_s16(in); + in += stride; + q12s16 = vld1q_s16(in); + in += stride; + q13s16 = vld1q_s16(in); + in += stride; + q14s16 = vld1q_s16(in); + in += stride; + q15s16 = vld1q_s16(in); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + 
q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + q12s16 = vcombine_s16(d17s16, d25s16); + q13s16 = vcombine_s16(d19s16, d27s16); + q14s16 = vcombine_s16(d21s16, d29s16); + q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16)); + q1x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16)); + q2x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16)); + q3x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + vst1q_s16(t_buf, q0x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q0x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[1]); + t_buf += 8; + } + return; } -static INLINE void idct32_bands_end_1st_pass( - int16_t *out, - int16x8_t q2s16, - int16x8_t q3s16, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16, - int16x8_t q10s16, - int16x8_t q11s16, - int16x8_t q12s16, - int16x8_t q13s16, - int16x8_t q14s16, - int16x8_t q15s16) { - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); - STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); - - LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); - STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); - - LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); - STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); - - LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); - STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); - - LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); - STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); - - LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, 
q1s16); - STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); - STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); - - LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); - STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); - - LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); - STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); - return; +static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16, + int16x8_t q3s16, int16x8_t q6s16, + int16x8_t q7s16, int16x8_t q8s16, + int16x8_t q9s16, int16x8_t q10s16, + int16x8_t q11s16, int16x8_t q12s16, + int16x8_t q13s16, int16x8_t q14s16, + int16x8_t q15s16) { + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); + STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); + + LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); + STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); + + LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); + STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); + + LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); + STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); + + LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); + STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); + + LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); + STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); + + LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); + STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); + + LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); + q4s16 = 
vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); + STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); + return; } static INLINE void idct32_bands_end_2nd_pass( - int16_t *out, - uint8_t *dest, - int stride, - int16x8_t q2s16, - int16x8_t q3s16, - int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16, - int16x8_t q10s16, - int16x8_t q11s16, - int16x8_t q12s16, - int16x8_t q13s16, - int16x8_t q14s16, - int16x8_t q15s16) { - uint8_t *r6 = dest + 31 * stride; - uint8_t *r7 = dest/* + 0 * stride*/; - uint8_t *r9 = dest + 15 * stride; - uint8_t *r10 = dest + 16 * stride; - int str2 = stride << 1; - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; r9 -= str2; - - LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; r6 -= str2; - - LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - - LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - return; + int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16, + int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16, + int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16, + int16x8_t q14s16, int16x8_t q15s16) { + uint8_t *r6 = dest + 31 * stride; + uint8_t *r7 = dest /* + 0 * stride*/; + uint8_t *r9 = dest + 15 * stride; + uint8_t *r10 = dest + 16 * 
stride; + int str2 = stride << 1; + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + + LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + return; } -void vpx_idct32x32_1024_add_neon( - int16_t *input, - uint8_t *dest, - int stride) { - int i, idct32_pass_loop; - int16_t trans_buf[32 * 8]; - int16_t pass1[32 * 32]; - int16_t pass2[32 * 32]; - int16_t *out; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - - for (idct32_pass_loop = 0, out = pass1; - idct32_pass_loop < 2; - idct32_pass_loop++, - input = pass1, // the input of pass2 is the result of pass1 - out = pass2) { - for (i = 0; - i < 4; i++, - input += 32 * 8, out += 8) { // idct32_bands_loop - idct32_transpose_pair(input, trans_buf); - - // ----------------------------------------- - // BLOCK A: 16-19,28-31 - // ----------------------------------------- - // generate 16,17,30,31 - // part of stage 1 - LOAD_FROM_TRANSPOSED(0, 1, 31) - DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(31, 17, 15) - DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) - // part of stage 2 - q4s16 = vaddq_s16(q0s16, q1s16); - q13s16 = vsubq_s16(q0s16, 
q1s16); - q6s16 = vaddq_s16(q2s16, q3s16); - q14s16 = vsubq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) - - // generate 18,19,28,29 - // part of stage 1 - LOAD_FROM_TRANSPOSED(15, 9, 23) - DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(23, 25, 7) - DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q3s16, q2s16); - q3s16 = vaddq_s16(q3s16, q2s16); - q14s16 = vsubq_s16(q1s16, q0s16); - q2s16 = vaddq_s16(q1s16, q0s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) - // part of stage 4 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q10s16 = vaddq_s16(q7s16, q1s16); - q15s16 = vaddq_s16(q6s16, q3s16); - q13s16 = vsubq_s16(q5s16, q0s16); - q14s16 = vsubq_s16(q7s16, q1s16); - STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) - STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) - STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) - // part of stage 4 - q13s16 = vsubq_s16(q4s16, q2s16); - q14s16 = vsubq_s16(q6s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) - STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) - - // ----------------------------------------- - // BLOCK B: 20-23,24-27 - // ----------------------------------------- - // generate 20,21,26,27 - // part of stage 1 - LOAD_FROM_TRANSPOSED(7, 5, 27) - DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(27, 21, 11) - DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - - // generate 22,23,24,25 - // part of stage 1 - LOAD_FROM_TRANSPOSED(11, 13, 19) - DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(19, 29, 3) - DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) - // part of stage 2 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) - // part of stage 4 - q10s16 = vaddq_s16(q7s16, q1s16); - q11s16 = vaddq_s16(q5s16, q0s16); - q12s16 = vaddq_s16(q6s16, q2s16); - q15s16 = vaddq_s16(q4s16, q3s16); - // part of stage 6 - LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q11s16); - q9s16 = vaddq_s16(q13s16, q10s16); - q13s16 = vsubq_s16(q13s16, q10s16); - q11s16 = vsubq_s16(q14s16, q11s16); - STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) - LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) - q8s16 = vsubq_s16(q9s16, q12s16); - q10s16 = vaddq_s16(q14s16, q15s16); - q14s16 = vsubq_s16(q14s16, q15s16); - q12s16 = vaddq_s16(q9s16, q12s16); - STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) - q13s16 = q11s16; - q14s16 = q8s16; - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) - // part of stage 4 - q14s16 = vsubq_s16(q5s16, q0s16); - q13s16 = vsubq_s16(q6s16, q2s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); - q14s16 = vsubq_s16(q7s16, q1s16); - q13s16 = 
vsubq_s16(q4s16, q3s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); - // part of stage 6 - LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q1s16); - q9s16 = vaddq_s16(q13s16, q6s16); - q13s16 = vsubq_s16(q13s16, q6s16); - q1s16 = vsubq_s16(q14s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) - LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) - q14s16 = vsubq_s16(q8s16, q5s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q9s16, q0s16); - q0s16 = vsubq_s16(q9s16, q0s16); - STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) - DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, - &q1s16, &q0s16); - STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) - - // ----------------------------------------- - // BLOCK C: 8-10,11-15 - // ----------------------------------------- - // generate 8,9,14,15 - // part of stage 2 - LOAD_FROM_TRANSPOSED(3, 2, 30) - DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(30, 18, 14) - DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) - // part of stage 3 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 4 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) - - // generate 10,11,12,13 - // part of stage 2 - LOAD_FROM_TRANSPOSED(14, 10, 22) - DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(22, 26, 6) - DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) - // part of stage 3 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 4 - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) - // part of stage 5 - q8s16 = vaddq_s16(q0s16, q5s16); - q9s16 = vaddq_s16(q1s16, q7s16); - q13s16 = vsubq_s16(q1s16, q7s16); - q14s16 = vsubq_s16(q3s16, q4s16); - q10s16 = vaddq_s16(q3s16, q4s16); - q15s16 = vaddq_s16(q2s16, q6s16); - STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) - STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) - // part of stage 6 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) - q13s16 = vsubq_s16(q0s16, q5s16); - q14s16 = vsubq_s16(q2s16, q6s16); - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) - - // ----------------------------------------- - // BLOCK D: 0-3,4-7 - // ----------------------------------------- - // generate 4,5,6,7 - // part of stage 3 - LOAD_FROM_TRANSPOSED(6, 4, 28) - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(28, 20, 12) - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - // part of stage 4 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - - // generate 0,1,2,3 - // part of stage 4 - LOAD_FROM_TRANSPOSED(12, 0, 16) - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(16, 8, 24) - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) - // part of stage 5 - q4s16 = vaddq_s16(q7s16, q6s16); - q7s16 = vsubq_s16(q7s16, q6s16); - q6s16 = vsubq_s16(q5s16, q14s16); - q5s16 = vaddq_s16(q5s16, q14s16); - // part of stage 6 - 
q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q3s16); - q10s16 = vaddq_s16(q6s16, q1s16); - q11s16 = vaddq_s16(q7s16, q0s16); - q12s16 = vsubq_s16(q7s16, q0s16); - q13s16 = vsubq_s16(q6s16, q1s16); - q14s16 = vsubq_s16(q5s16, q3s16); - q15s16 = vsubq_s16(q4s16, q2s16); - // part of stage 7 - LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) - q2s16 = vaddq_s16(q8s16, q1s16); - q3s16 = vaddq_s16(q9s16, q0s16); - q4s16 = vsubq_s16(q9s16, q0s16); - q5s16 = vsubq_s16(q8s16, q1s16); - LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - - if (idct32_pass_loop == 0) { - idct32_bands_end_1st_pass(out, - q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); - } else { - idct32_bands_end_2nd_pass(out, dest, stride, - q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, q15s16); - dest += 8; - } - } +void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) { + int i, idct32_pass_loop; + int16_t trans_buf[32 * 8]; + int16_t pass1[32 * 32]; + int16_t pass2[32 * 32]; + int16_t *out; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + + for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; + idct32_pass_loop++, + input = pass1, // the input of pass2 is the result of pass1 + out = pass2) { + for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop + idct32_transpose_pair(input, trans_buf); + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + LOAD_FROM_TRANSPOSED(0, 1, 31) + DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(31, 17, 15) + DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) + // part of stage 2 + q4s16 = vaddq_s16(q0s16, q1s16); + q13s16 = vsubq_s16(q0s16, q1s16); + q6s16 = vaddq_s16(q2s16, q3s16); + q14s16 = vsubq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) + + // generate 18,19,28,29 + // part of stage 1 + LOAD_FROM_TRANSPOSED(15, 9, 23) + DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(23, 25, 7) + DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q3s16, q2s16); + q3s16 = vaddq_s16(q3s16, q2s16); + q14s16 = vsubq_s16(q1s16, q0s16); + q2s16 = vaddq_s16(q1s16, q0s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) + // part of stage 4 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q10s16 = vaddq_s16(q7s16, q1s16); + q15s16 = vaddq_s16(q6s16, q3s16); + q13s16 = vsubq_s16(q5s16, q0s16); + q14s16 = vsubq_s16(q7s16, q1s16); + STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) + STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) + STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) + // part of stage 4 + q13s16 = vsubq_s16(q4s16, q2s16); + q14s16 = vsubq_s16(q6s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) + STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) + + // ----------------------------------------- + // BLOCK B: 20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of 
stage 1 + LOAD_FROM_TRANSPOSED(7, 5, 27) + DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(27, 21, 11) + DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + + // generate 22,23,24,25 + // part of stage 1 + LOAD_FROM_TRANSPOSED(11, 13, 19) + DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(19, 29, 3) + DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) + // part of stage 2 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) + // part of stage 4 + q10s16 = vaddq_s16(q7s16, q1s16); + q11s16 = vaddq_s16(q5s16, q0s16); + q12s16 = vaddq_s16(q6s16, q2s16); + q15s16 = vaddq_s16(q4s16, q3s16); + // part of stage 6 + LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q11s16); + q9s16 = vaddq_s16(q13s16, q10s16); + q13s16 = vsubq_s16(q13s16, q10s16); + q11s16 = vsubq_s16(q14s16, q11s16); + STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) + LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) + q8s16 = vsubq_s16(q9s16, q12s16); + q10s16 = vaddq_s16(q14s16, q15s16); + q14s16 = vsubq_s16(q14s16, q15s16); + q12s16 = vaddq_s16(q9s16, q12s16); + STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) + q13s16 = q11s16; + q14s16 = q8s16; + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) + // part of stage 4 + q14s16 = vsubq_s16(q5s16, q0s16); + q13s16 = vsubq_s16(q6s16, q2s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); + q14s16 = vsubq_s16(q7s16, q1s16); + q13s16 = vsubq_s16(q4s16, q3s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); + // part of stage 6 + LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q1s16); + q9s16 = vaddq_s16(q13s16, q6s16); + q13s16 = vsubq_s16(q13s16, q6s16); + q1s16 = vsubq_s16(q14s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) + LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) + q14s16 = vsubq_s16(q8s16, q5s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q9s16, q0s16); + q0s16 = vsubq_s16(q9s16, q0s16); + STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) + DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16); + STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + LOAD_FROM_TRANSPOSED(3, 2, 30) + DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(30, 18, 14) + DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) + // part of stage 3 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 4 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) + + // generate 10,11,12,13 + // part of stage 2 + 
LOAD_FROM_TRANSPOSED(14, 10, 22) + DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(22, 26, 6) + DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) + // part of stage 3 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 4 + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) + // part of stage 5 + q8s16 = vaddq_s16(q0s16, q5s16); + q9s16 = vaddq_s16(q1s16, q7s16); + q13s16 = vsubq_s16(q1s16, q7s16); + q14s16 = vsubq_s16(q3s16, q4s16); + q10s16 = vaddq_s16(q3s16, q4s16); + q15s16 = vaddq_s16(q2s16, q6s16); + STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) + STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) + // part of stage 6 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) + q13s16 = vsubq_s16(q0s16, q5s16); + q14s16 = vsubq_s16(q2s16, q6s16); + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + LOAD_FROM_TRANSPOSED(6, 4, 28) + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(28, 20, 12) + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + // part of stage 4 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + + // generate 0,1,2,3 + // part of stage 4 + LOAD_FROM_TRANSPOSED(12, 0, 16) + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(16, 8, 24) + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) + // part of stage 5 + q4s16 = vaddq_s16(q7s16, q6s16); + q7s16 = vsubq_s16(q7s16, q6s16); + q6s16 = vsubq_s16(q5s16, q14s16); + q5s16 = vaddq_s16(q5s16, q14s16); + // part of stage 6 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q3s16); + q10s16 = vaddq_s16(q6s16, q1s16); + q11s16 = vaddq_s16(q7s16, q0s16); + q12s16 = vsubq_s16(q7s16, q0s16); + q13s16 = vsubq_s16(q6s16, q1s16); + q14s16 = vsubq_s16(q5s16, q3s16); + q15s16 = vsubq_s16(q4s16, q2s16); + // part of stage 7 + LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) + q2s16 = vaddq_s16(q8s16, q1s16); + q3s16 = vaddq_s16(q9s16, q0s16); + q4s16 = vsubq_s16(q9s16, q0s16); + q5s16 = vsubq_s16(q8s16, q1s16); + LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, + q10s16, q11s16, q12s16, q13s16, q14s16, + q15s16); + } else { + idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16, + q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, + q14s16, q15s16); + dest += 8; + } } - return; + } + return; } diff --git a/vpx_dsp/arm/idct4x4_1_add_neon.c b/vpx_dsp/arm/idct4x4_1_add_neon.c index ea618700c95457f77b1a361754446ce98eb5eba4..9f999e979d7fd1721db04c2475a36902463879f5 100644 --- a/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -13,38 +13,34 @@ #include "vpx_dsp/inv_txfm.h" #include "vpx_ports/mem.h" -void vpx_idct4x4_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d6u8; - 
uint32x2_t d2u32 = vdup_n_u32(0); - uint16x8_t q8u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); +void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d6u8; + uint32x2_t d2u32 = vdup_n_u32(0); + uint16x8_t q8u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); - q0s16 = vdupq_n_s16(a1); + q0s16 = vdupq_n_s16(a1); - // dc_only_idct_add - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); - d1 += dest_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); - d1 += dest_stride; + // dc_only_idct_add + d1 = d2 = dest; + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); + d1 += dest_stride; + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); + d1 += dest_stride; - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), - vreinterpret_u8_u32(d2u32)); - d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); + d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); - d2 += dest_stride; - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); - d2 += dest_stride; - } - return; + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); + d2 += dest_stride; + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); + d2 += dest_stride; + } + return; } diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c index 3c975c99b771c7fbe063fecba3dde0d469e116ef..382626928bfdcd15dffad54bff915edd6b845f4f 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.c +++ b/vpx_dsp/arm/idct4x4_add_neon.c @@ -10,142 +10,137 @@ #include <arm_neon.h> -void vpx_idct4x4_16_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d26u8, d27u8; - uint32x2_t d26u32, d27u32; - uint16x8_t q8u16, q9u16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; - int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; - int16x8_t q8s16, q9s16, q13s16, q14s16; - int32x4_t q1s32, q13s32, q14s32, q15s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - uint8_t *d; - int16_t cospi_8_64 = 15137; - int16_t cospi_16_64 = 11585; - int16_t cospi_24_64 = 6270; - - d26u32 = d27u32 = vdup_n_u32(0); - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - d20s16 = vdup_n_s16(cospi_8_64); - d21s16 = vdup_n_s16(cospi_16_64); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - d22s16 = vdup_n_s16(cospi_24_64); - - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - 
d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_high_s16(q9s16); // vswp d18 d19 - d19s16 = vget_low_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), - vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - // do the transform on columns - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d = dest; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); - d += dest_stride; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - d = dest; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); - return; +void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d26u8, d27u8; + uint32x2_t d26u32, d27u32; + uint16x8_t q8u16, q9u16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; + int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; + int16x8_t q8s16, q9s16, q13s16, q14s16; + int32x4_t q1s32, q13s32, q14s32, q15s32; + int16x4x2_t d0x2s16, d1x2s16; + int32x4x2_t q0x2s32; + uint8_t *d; + int16_t cospi_8_64 = 15137; + int16_t cospi_16_64 = 11585; + int16_t cospi_24_64 = 6270; + + d26u32 = d27u32 = vdup_n_u32(0); + + q8s16 = 
vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + + d0x2s16 = vtrn_s16(d16s16, d17s16); + d1x2s16 = vtrn_s16(d18s16, d19s16); + q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); + q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + + d20s16 = vdup_n_s16(cospi_8_64); + d21s16 = vdup_n_s16(cospi_16_64); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); + d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + + d22s16 = vdup_n_s16(cospi_24_64); + + // stage 1 + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, d22s16); + q1s32 = vmull_s16(d17s16, d20s16); + q13s32 = vmull_s16(d23s16, d21s16); + q14s32 = vmull_s16(d24s16, d21s16); + + q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); + q1s32 = vmlal_s16(q1s32, d19s16, d22s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_high_s16(q9s16); // vswp d18 d19 + d19s16 = vget_low_s16(q9s16); + + d0x2s16 = vtrn_s16(d16s16, d17s16); + d1x2s16 = vtrn_s16(d18s16, d19s16); + q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); + q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); + d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + + // do the transform on columns + // stage 1 + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, d22s16); + q1s32 = vmull_s16(d17s16, d20s16); + q13s32 = vmull_s16(d23s16, d21s16); + q14s32 = vmull_s16(d24s16, d21s16); + + q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); + q1s32 = vmlal_s16(q1s32, d19s16, d22s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + q8s16 = vrshrq_n_s16(q8s16, 4); + q9s16 = vrshrq_n_s16(q9s16, 4); + + d = dest; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); + d += dest_stride; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); + + d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + + d = dest; + vst1_lane_u32((uint32_t *)d, 
vreinterpret_u32_u8(d26u8), 0); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); + return; } diff --git a/vpx_dsp/arm/idct8x8_1_add_neon.c b/vpx_dsp/arm/idct8x8_1_add_neon.c index c1b801fad54390af8544f994de6bc9a2f98cae94..e3db0b876bc2cb5cf3fe2db7ee70ee99a555e639 100644 --- a/vpx_dsp/arm/idct8x8_1_add_neon.c +++ b/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -13,52 +13,49 @@ #include "vpx_dsp/inv_txfm.h" #include "vpx_ports/mem.h" -void vpx_idct8x8_1_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 5); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d5u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - return; +void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d2u8, d3u8, d30u8, d31u8; + uint64x1_t d2u64, d3u64, d4u64, d5u64; + uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, a1, cospi_16_64 = 11585; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 5); + + q0s16 = vdupq_n_s16(a1); + q0u16 = vreinterpretq_u16_s16(q0s16); + + d1 = d2 = dest; + for (i = 0; i < 2; i++) { + d2u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d4u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d5u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + + q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); + q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); + q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); + q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, 
vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); + d2 += dest_stride; + } + return; } diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c index 4b2c2a6f83c0ce731605827984aa658dd662ada4..f1c271110dea9fedc801d36e2e416676c5d81c53 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.c +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -13,528 +13,496 @@ #include "./vpx_config.h" #include "vpx_dsp/txfm_common.h" -static INLINE void TRANSPOSE8X8( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), - vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), - vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), - vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), - vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; +static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, + int16x8_t *q10s16, int16x8_t *q11s16, + int16x8_t *q12s16, int16x8_t *q13s16, + int16x8_t *q14s16, int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = 
vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; } -static INLINE void IDCT8x8_1D( - int16x8_t *q8s16, - int16x8_t *q9s16, - int16x8_t *q10s16, - int16x8_t *q11s16, - int16x8_t *q12s16, - int16x8_t *q13s16, - int16x8_t *q14s16, - int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = 
vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); - return; +static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, + int16x8_t *q10s16, int16x8_t *q11s16, + int16x8_t *q12s16, int16x8_t *q13s16, + int16x8_t 
*q14s16, int16x8_t *q15s16) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + + d0s16 = vdup_n_s16(cospi_28_64); + d1s16 = vdup_n_s16(cospi_4_64); + d2s16 = vdup_n_s16(cospi_12_64); + d3s16 = vdup_n_s16(cospi_20_64); + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d26s16, d2s16); + q6s32 = vmull_s16(d27s16, d2s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); + q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d10s16 = vqrshrn_n_s32(q5s32, 14); + d11s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q2s32 = vmull_s16(d18s16, d1s16); + q3s32 = vmull_s16(d19s16, d1s16); + q9s32 = vmull_s16(d26s16, d3s16); + q13s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlal_s16(q2s32, d30s16, d0s16); + q3s32 = vmlal_s16(q3s32, d31s16, d0s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q13s32 = vmlal_s16(q13s32, d23s16, d2s16); + + d14s16 = vqrshrn_n_s32(q2s32, 14); + d15s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q13s32, 14); + q6s16 = vcombine_s16(d12s16, d13s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d0s16 = vdup_n_s16(cospi_16_64); + + q2s32 = vmull_s16(d16s16, d0s16); + q3s32 = vmull_s16(d17s16, d0s16); + q13s32 = vmull_s16(d16s16, d0s16); + q15s32 = vmull_s16(d17s16, d0s16); + + q2s32 = vmlal_s16(q2s32, d24s16, d0s16); + q3s32 = vmlal_s16(q3s32, d25s16, d0s16); + q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); + q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); + + d0s16 = vdup_n_s16(cospi_24_64); + d1s16 = vdup_n_s16(cospi_8_64); + + d18s16 = vqrshrn_n_s32(q2s32, 14); + d19s16 = vqrshrn_n_s32(q3s32, 14); + d22s16 = vqrshrn_n_s32(q13s32, 14); + d23s16 = vqrshrn_n_s32(q15s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + *q11s16 = vcombine_s16(d22s16, d23s16); + + q2s32 = vmull_s16(d20s16, d0s16); + q3s32 = vmull_s16(d21s16, d0s16); + q8s32 = vmull_s16(d20s16, d1s16); + q12s32 = vmull_s16(d21s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); + q8s32 = vmlal_s16(q8s32, d28s16, d0s16); + q12s32 = vmlal_s16(q12s32, d29s16, d0s16); + + d26s16 = vqrshrn_n_s32(q2s32, 14); + d27s16 = vqrshrn_n_s32(q3s32, 14); + d30s16 = vqrshrn_n_s32(q8s32, 14); + d31s16 = vqrshrn_n_s32(q12s32, 14); + *q13s16 = vcombine_s16(d26s16, d27s16); + *q15s16 = vcombine_s16(d30s16, d31s16); + + q0s16 = vaddq_s16(*q9s16, *q15s16); + q1s16 = 
vaddq_s16(*q11s16, *q13s16); + q2s16 = vsubq_s16(*q11s16, *q13s16); + q3s16 = vsubq_s16(*q9s16, *q15s16); + + *q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + *q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + *q8s16 = vaddq_s16(q0s16, q7s16); + *q9s16 = vaddq_s16(q1s16, q6s16); + *q10s16 = vaddq_s16(q2s16, q5s16); + *q11s16 = vaddq_s16(q3s16, q4s16); + *q12s16 = vsubq_s16(q3s16, q4s16); + *q13s16 = vsubq_s16(q2s16, q5s16); + *q14s16 = vsubq_s16(q1s16, q6s16); + *q15s16 = vsubq_s16(q0s16, q7s16); + return; } -void vpx_idct8x8_64_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t 
*)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; +void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = 
q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; } -void vpx_idct8x8_12_add_neon( - int16_t *input, - uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - int32x4_t q9s32, q10s32, q11s32, q12s32; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - // First transform rows - // stage 1 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - - q0s16 = vdupq_n_s16(-cospi_20_64 * 2); - - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - q1s16 = vdupq_n_s16(cospi_12_64 * 2); - - q5s16 = vqrdmulhq_s16(q11s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_16_64 * 2); - - q6s16 = vqrdmulhq_s16(q11s16, q1s16); - - // stage 2 & stage 3 - even half - q1s16 = vdupq_n_s16(cospi_24_64 * 2); - - q9s16 = vqrdmulhq_s16(q8s16, q0s16); - - q0s16 = vdupq_n_s16(cospi_8_64 * 2); - - q13s16 = vqrdmulhq_s16(q10s16, q1s16); - - q15s16 = vqrdmulhq_s16(q10s16, q0s16); - - // stage 3 -odd half - q0s16 = vaddq_s16(q9s16, q15s16); - q1s16 = vaddq_s16(q9s16, q13s16); - q2s16 = vsubq_s16(q9s16, q13s16); - q3s16 = vsubq_s16(q9s16, q15s16); - - // stage 2 - odd half - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - 
d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - q8s16 = vaddq_s16(q0s16, q7s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q7s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, - &q12s16, &q13s16, &q14s16, &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), - vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), - vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), - vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), - vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; +void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; + int16x4_t d26s16, d27s16, d28s16, d29s16; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, 
q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // First transform rows + // stage 1 + q0s16 = vdupq_n_s16(cospi_28_64 * 2); + q1s16 = vdupq_n_s16(cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + + q0s16 = vdupq_n_s16(-cospi_20_64 * 2); + + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + q1s16 = vdupq_n_s16(cospi_12_64 * 2); + + q5s16 = vqrdmulhq_s16(q11s16, q0s16); + + q0s16 = vdupq_n_s16(cospi_16_64 * 2); + + q6s16 = vqrdmulhq_s16(q11s16, q1s16); + + // stage 2 & stage 3 - even half + q1s16 = vdupq_n_s16(cospi_24_64 * 2); + + q9s16 = vqrdmulhq_s16(q8s16, q0s16); + + q0s16 = vdupq_n_s16(cospi_8_64 * 2); + + q13s16 = vqrdmulhq_s16(q10s16, q1s16); + + q15s16 = vqrdmulhq_s16(q10s16, q0s16); + + // stage 3 -odd half + q0s16 = vaddq_s16(q9s16, q15s16); + q1s16 = vaddq_s16(q9s16, q13s16); + q2s16 = vsubq_s16(q9s16, q13s16); + q3s16 = vsubq_s16(q9s16, q15s16); + + // stage 2 - odd half + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d16s16 = vdup_n_s16(cospi_16_64); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + q8s16 = vaddq_s16(q0s16, q7s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q7s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + 
d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; } diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 0a376104d2bbc1f0d57c3e6062f95f4f91b53361..32dd1ba14606f25efabf12195ff882f147a31556 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -18,9 +18,8 @@ // DC 4x4 // 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { +static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; @@ -33,7 +32,7 @@ static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, } if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border + const uint8x8_t L = vld1_u8(left); // left border const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left const uint16x4_t p1 = vpadd_u16(p0, p0); sum_left = vcombine_u16(p1, p1); @@ -54,7 +53,7 @@ static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc = vdup_lane_u8(dc0, 0); int i; for (i = 0; i < 4; ++i) { - vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0); + vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); } } } @@ -87,9 +86,8 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, // DC 8x8 // 'do_above' and 'do_left' facilitate branch removal when inlined. 
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { +static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; @@ -103,7 +101,7 @@ static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, } if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border + const uint8x8_t L = vld1_u8(left); // left border const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p2 = vpadd_u16(p1, p1); @@ -125,7 +123,7 @@ static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc = vdup_lane_u8(dc0, 0); int i; for (i = 0; i < 8; ++i) { - vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); + vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); } } } @@ -167,7 +165,7 @@ static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, if (do_above) { const uint8x16_t A = vld1q_u8(above); // top row - const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top + const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); @@ -425,8 +423,7 @@ void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, (void)left; d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += stride) - vst1_u8(dst, d0u8); + for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); } void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, @@ -436,8 +433,7 @@ void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, (void)left; q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += stride) - vst1q_u8(dst, q0u8); + for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); } void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, @@ -608,8 +604,8 @@ void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); for (i = 0; i < 4; i++, dst += stride) { q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), - vreinterpretq_s16_u16(q3u16)); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); d0u8 = vqmovun_s16(q1s16); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); } @@ -631,26 +627,26 @@ void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, d20u16 = vget_low_u16(q10u16); for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, 
vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; @@ -677,14 +673,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { q0u16 = vdupq_lane_u16(d20u16, 0); q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); + q11s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); + q8s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); d2u8 = vqmovun_s16(q1s16); d3u8 = vqmovun_s16(q0s16); d22u8 = vqmovun_s16(q11s16); @@ -698,14 +694,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, q0u16 = vdupq_lane_u16(d20u16, 2); q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); + q11s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); + q8s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); d2u8 = vqmovun_s16(q1s16); d3u8 = vqmovun_s16(q0s16); d22u8 = vqmovun_s16(q11s16); @@ -742,10 +738,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, d6u16 = vget_low_u16(q3u16); for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { q0u16 = vdupq_lane_u16(d6u16, 0); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), @@ -761,10 +757,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, dst += stride; q0u16 = vdupq_lane_u16(d6u16, 1); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), @@ -780,10 +776,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, 
ptrdiff_t stride, dst += stride; q0u16 = vdupq_lane_u16(d6u16, 2); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), @@ -799,10 +795,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, dst += stride; q0u16 = vdupq_lane_u16(d6u16, 3); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q10u16)); q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), diff --git a/vpx_dsp/arm/loopfilter_16_neon.c b/vpx_dsp/arm/loopfilter_16_neon.c index d24e6adc8a64a7bc72a91b5eecc19730c9288758..9607bb24056b4cc39212a5781ba812da10fbec3b 100644 --- a/vpx_dsp/arm/loopfilter_16_neon.c +++ b/vpx_dsp/arm/loopfilter_16_neon.c @@ -14,166 +14,160 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -static INLINE void loop_filter_neon_16( - uint8x16_t qblimit, // blimit - uint8x16_t qlimit, // limit - uint8x16_t qthresh, // thresh - uint8x16_t q3, // p3 - uint8x16_t q4, // p2 - uint8x16_t q5, // p1 - uint8x16_t q6, // p0 - uint8x16_t q7, // q0 - uint8x16_t q8, // q1 - uint8x16_t q9, // q2 - uint8x16_t q10, // q3 - uint8x16_t *q5r, // p1 - uint8x16_t *q6r, // p0 - uint8x16_t *q7r, // q0 - uint8x16_t *q8r) { // q1 - uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int16x8_t q2s16, q11s16; - uint16x8_t q4u16; - int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; - int8x8_t d2s8, d3s8; - - q11u8 = vabdq_u8(q3, q4); - q12u8 = vabdq_u8(q4, q5); - q13u8 = vabdq_u8(q5, q6); - q14u8 = vabdq_u8(q8, q7); - q3 = vabdq_u8(q9, q8); - q4 = vabdq_u8(q10, q9); - - q11u8 = vmaxq_u8(q11u8, q12u8); - q12u8 = vmaxq_u8(q13u8, q14u8); - q3 = vmaxq_u8(q3, q4); - q15u8 = vmaxq_u8(q11u8, q12u8); - - q9 = vabdq_u8(q6, q7); - - // vp8_hevmask - q13u8 = vcgtq_u8(q13u8, qthresh); - q14u8 = vcgtq_u8(q14u8, qthresh); - q15u8 = vmaxq_u8(q15u8, q3); - - q2u8 = vabdq_u8(q5, q8); - q9 = vqaddq_u8(q9, q9); - - q15u8 = vcgeq_u8(qlimit, q15u8); - - // vp8_filter() function - // convert to signed - q10 = vdupq_n_u8(0x80); - q8 = veorq_u8(q8, q10); - q7 = veorq_u8(q7, q10); - q6 = veorq_u8(q6, q10); - q5 = veorq_u8(q5, q10); - - q2u8 = vshrq_n_u8(q2u8, 1); - q9 = vqaddq_u8(q9, q2u8); - - q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), - vget_low_s8(vreinterpretq_s8_u8(q6))); - q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), - vget_high_s8(vreinterpretq_s8_u8(q6))); - - q9 = vcgeq_u8(qblimit, q9); - - q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), - vreinterpretq_s8_u8(q8)); - - q14u8 = vorrq_u8(q13u8, q14u8); - - q4u16 = vdupq_n_u16(3); - q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); - q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); - - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); - q15u8 = vandq_u8(q15u8, q9); - - q1s8 = vreinterpretq_s8_u8(q1u8); - q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); - q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); 
- - q4 = vdupq_n_u8(3); - q9 = vdupq_n_u8(4); - // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) - d2s8 = vqmovn_s16(q2s16); - d3s8 = vqmovn_s16(q11s16); - q1s8 = vcombine_s8(d2s8, d3s8); - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); - q1s8 = vreinterpretq_s8_u8(q1u8); - - q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); - q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); - q2s8 = vshrq_n_s8(q2s8, 3); - q1s8 = vshrq_n_s8(q1s8, 3); - - q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); - q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); - - q1s8 = vrshrq_n_s8(q1s8, 1); - q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); - - q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); - q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); - - *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); - *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); - *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); - *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); - return; +static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // vp8_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vdupq_n_u16(3); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q4 = vdupq_n_u8(3); + q9 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q0s8 = 
vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); + *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); + return; } -void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; - uint8x16_t qblimit, qlimit, qthresh; - uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; - - dblimit0 = vld1_u8(blimit0); - dlimit0 = vld1_u8(limit0); - dthresh0 = vld1_u8(thresh0); - dblimit1 = vld1_u8(blimit1); - dlimit1 = vld1_u8(limit1); - dthresh1 = vld1_u8(thresh1); - qblimit = vcombine_u8(dblimit0, dblimit1); - qlimit = vcombine_u8(dlimit0, dlimit1); - qthresh = vcombine_u8(dthresh0, dthresh1); - - s -= (p << 2); - - q3u8 = vld1q_u8(s); - s += p; - q4u8 = vld1q_u8(s); - s += p; - q5u8 = vld1q_u8(s); - s += p; - q6u8 = vld1q_u8(s); - s += p; - q7u8 = vld1q_u8(s); - s += p; - q8u8 = vld1q_u8(s); - s += p; - q9u8 = vld1q_u8(s); - s += p; - q10u8 = vld1q_u8(s); - - loop_filter_neon_16(qblimit, qlimit, qthresh, - q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, - &q5u8, &q6u8, &q7u8, &q8u8); - - s -= (p * 5); - vst1q_u8(s, q5u8); - s += p; - vst1q_u8(s, q6u8); - s += p; - vst1q_u8(s, q7u8); - s += p; - vst1q_u8(s, q8u8); - return; +void vpx_lpf_horizontal_4_dual_neon( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; + uint8x16_t qblimit, qlimit, qthresh; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + + dblimit0 = vld1_u8(blimit0); + dlimit0 = vld1_u8(limit0); + dthresh0 = vld1_u8(thresh0); + dblimit1 = vld1_u8(blimit1); + dlimit1 = vld1_u8(limit1); + dthresh1 = vld1_u8(thresh1); + qblimit = vcombine_u8(dblimit0, dblimit1); + qlimit = vcombine_u8(dlimit0, dlimit1); + qthresh = vcombine_u8(dthresh0, dthresh1); + + s -= (p << 2); + + q3u8 = vld1q_u8(s); + s += p; + q4u8 = vld1q_u8(s); + s += p; + q5u8 = vld1q_u8(s); + s += p; + q6u8 = vld1q_u8(s); + s += p; + q7u8 = vld1q_u8(s); + s += p; + q8u8 = vld1q_u8(s); + s += p; + q9u8 = vld1q_u8(s); + s += p; + q10u8 = vld1q_u8(s); + + loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8, + q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8); + + s -= (p * 5); + vst1q_u8(s, q5u8); + s += p; + vst1q_u8(s, q6u8); + s += p; + vst1q_u8(s, q7u8); + s += p; + vst1q_u8(s, q8u8); + return; } diff --git a/vpx_dsp/arm/loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c index 7f3ee70b94873aef38a4d37efe4909e732192f49..1c1e80e00088ee801ab11665ac758438a3367925 100644 --- a/vpx_dsp/arm/loopfilter_4_neon.c +++ b/vpx_dsp/arm/loopfilter_4_neon.c @@ -12,255 +12,238 @@ #include "./vpx_dsp_rtcd.h" -static INLINE void loop_filter_neon( - uint8x8_t dblimit, // flimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p3 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // 
q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d4ru8, // p1 - uint8x8_t *d5ru8, // p0 - uint8x8_t *d6ru8, // q0 - uint8x8_t *d7ru8) { // q1 - uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; - int16x8_t q12s16; - int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d3u8 = vabd_u8(d17u8, d16u8); - d4u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - d3u8 = vmax_u8(d3u8, d4u8); - d23u8 = vmax_u8(d19u8, d20u8); - - d17u8 = vabd_u8(d6u8, d7u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - d22u8 = vcgt_u8(d22u8, dthresh); - d23u8 = vmax_u8(d23u8, d3u8); - - d28u8 = vabd_u8(d5u8, d16u8); - d17u8 = vqadd_u8(d17u8, d17u8); - - d23u8 = vcge_u8(dlimit, d23u8); - - d18u8 = vdup_n_u8(0x80); - d5u8 = veor_u8(d5u8, d18u8); - d6u8 = veor_u8(d6u8, d18u8); - d7u8 = veor_u8(d7u8, d18u8); - d16u8 = veor_u8(d16u8, d18u8); - - d28u8 = vshr_n_u8(d28u8, 1); - d17u8 = vqadd_u8(d17u8, d28u8); - - d19u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), - vreinterpret_s8_u8(d6u8)); - - d17u8 = vcge_u8(dblimit, d17u8); - - d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), - vreinterpret_s8_u8(d16u8)); - - d22u8 = vorr_u8(d21u8, d22u8); - - q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); - d23u8 = vand_u8(d23u8, d17u8); - - q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); - - d17u8 = vdup_n_u8(4); - - d27s8 = vqmovn_s16(q12s16); - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); - d27s8 = vreinterpret_s8_u8(d27u8); - - d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); - d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); - d28s8 = vshr_n_s8(d28s8, 3); - d27s8 = vshr_n_s8(d27s8, 3); - - d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); - - d27s8 = vrshr_n_s8(d27s8, 1); - d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); - - d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); - d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); - - *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); - *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); - *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); - *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); - return; +static INLINE void loop_filter_neon(uint8x8_t dblimit, // flimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p3 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d4ru8, // p1 + uint8x8_t *d5ru8, // p0 + uint8x8_t *d6ru8, // q0 + uint8x8_t *d7ru8) { // q1 + uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; + int16x8_t q12s16; + int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; + + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d3u8 = vabd_u8(d17u8, d16u8); + d4u8 = vabd_u8(d18u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); + d3u8 = vmax_u8(d3u8, d4u8); + d23u8 = vmax_u8(d19u8, d20u8); + + d17u8 = vabd_u8(d6u8, d7u8); + + d21u8 = vcgt_u8(d21u8, dthresh); + d22u8 = vcgt_u8(d22u8, dthresh); + d23u8 = vmax_u8(d23u8, d3u8); + + d28u8 = vabd_u8(d5u8, d16u8); + d17u8 = vqadd_u8(d17u8, d17u8); + + d23u8 = vcge_u8(dlimit, d23u8); + + d18u8 = vdup_n_u8(0x80); + d5u8 = veor_u8(d5u8, d18u8); + d6u8 = 
veor_u8(d6u8, d18u8); + d7u8 = veor_u8(d7u8, d18u8); + d16u8 = veor_u8(d16u8, d18u8); + + d28u8 = vshr_n_u8(d28u8, 1); + d17u8 = vqadd_u8(d17u8, d28u8); + + d19u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8)); + + d17u8 = vcge_u8(dblimit, d17u8); + + d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8)); + + d22u8 = vorr_u8(d21u8, d22u8); + + q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); + d23u8 = vand_u8(d23u8, d17u8); + + q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); + + d17u8 = vdup_n_u8(4); + + d27s8 = vqmovn_s16(q12s16); + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); + d27s8 = vreinterpret_s8_u8(d27u8); + + d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); + d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); + d28s8 = vshr_n_s8(d28s8, 3); + d27s8 = vshr_n_s8(d27s8, 3); + + d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); + + d27s8 = vrshr_n_s8(d27s8, 1); + d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); + + d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); + d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); + + *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); + *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); + *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); + *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); + return; } -void vpx_lpf_horizontal_4_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - loop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d4u8, &d5u8, &d6u8, &d7u8); - - s -= (pitch * 5); - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - s += pitch; - vst1_u8(s, d6u8); - s += pitch; - vst1_u8(s, d7u8); - } - return; +void vpx_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < 1; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); + + s -= (pitch * 5); + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + s += pitch; + vst1_u8(s, d6u8); + s += pitch; + vst1_u8(s, d7u8); + } + return; } -void vpx_lpf_vertical_4_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const 
uint8_t *thresh) { - int i, pitch8; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - pitch8 = pitch * 8; - for (i = 0; i < 1; i++, src += pitch8) { - s = src - (i + 1) * 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), - vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), - vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), - vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), - vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - loop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d4u8, &d5u8, &d6u8, &d7u8); - - d4Result.val[0] = d4u8; - d4Result.val[1] = d5u8; - d4Result.val[2] = d6u8; - d4Result.val[3] = d7u8; - - src -= 2; - vst4_lane_u8(src, d4Result, 0); - src += pitch; - vst4_lane_u8(src, d4Result, 1); - src += pitch; - vst4_lane_u8(src, d4Result, 2); - src += pitch; - vst4_lane_u8(src, d4Result, 3); - src += pitch; - vst4_lane_u8(src, d4Result, 4); - src += pitch; - vst4_lane_u8(src, d4Result, 5); - src += pitch; - vst4_lane_u8(src, d4Result, 6); - src += pitch; - vst4_lane_u8(src, d4Result, 7); - } - return; +void vpx_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i, pitch8; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + pitch8 = pitch * 8; + for (i = 0; i < 1; i++, src += pitch8) { + s = src - (i + 1) * 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + 
d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); + + d4Result.val[0] = d4u8; + d4Result.val[1] = d5u8; + d4Result.val[2] = d6u8; + d4Result.val[3] = d7u8; + + src -= 2; + vst4_lane_u8(src, d4Result, 0); + src += pitch; + vst4_lane_u8(src, d4Result, 1); + src += pitch; + vst4_lane_u8(src, d4Result, 2); + src += pitch; + vst4_lane_u8(src, d4Result, 3); + src += pitch; + vst4_lane_u8(src, d4Result, 4); + src += pitch; + vst4_lane_u8(src, d4Result, 5); + src += pitch; + vst4_lane_u8(src, d4Result, 6); + src += pitch; + vst4_lane_u8(src, d4Result, 7); + } + return; } diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c index ec3757380d572f047145d751575ed80c336f4f44..854196f4272e692bc6f85333095c5139f7950032 100644 --- a/vpx_dsp/arm/loopfilter_8_neon.c +++ b/vpx_dsp/arm/loopfilter_8_neon.c @@ -12,434 +12,418 @@ #include "./vpx_dsp_rtcd.h" -static INLINE void mbloop_filter_neon( - uint8x8_t dblimit, // mblimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p2 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d0ru8, // p1 - uint8x8_t *d1ru8, // p1 - uint8x8_t *d2ru8, // p0 - uint8x8_t *d3ru8, // q0 - uint8x8_t *d4ru8, // q1 - uint8x8_t *d5ru8) { // q1 - uint32_t flat; - uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; - uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; - int16x8_t q15s16; - uint16x8_t q10u16, q14u16; - int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; +static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p2 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d0ru8, // p1 + uint8x8_t *d1ru8, // p1 + uint8x8_t *d2ru8, // p0 + uint8x8_t *d3ru8, // q0 + uint8x8_t *d4ru8, // q1 + uint8x8_t *d5ru8) { // q1 + uint32_t flat; + uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, 
d24u8; + uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int16x8_t q15s16; + uint16x8_t q10u16, q14u16; + int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d23u8 = vabd_u8(d17u8, d16u8); - d24u8 = vabd_u8(d18u8, d17u8); + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d23u8 = vabd_u8(d17u8, d16u8); + d24u8 = vabd_u8(d18u8, d17u8); - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); - d25u8 = vabd_u8(d6u8, d4u8); + d25u8 = vabd_u8(d6u8, d4u8); - d23u8 = vmax_u8(d23u8, d24u8); + d23u8 = vmax_u8(d23u8, d24u8); - d26u8 = vabd_u8(d7u8, d17u8); + d26u8 = vabd_u8(d7u8, d17u8); - d19u8 = vmax_u8(d19u8, d20u8); + d19u8 = vmax_u8(d19u8, d20u8); - d24u8 = vabd_u8(d6u8, d7u8); - d27u8 = vabd_u8(d3u8, d6u8); - d28u8 = vabd_u8(d18u8, d7u8); + d24u8 = vabd_u8(d6u8, d7u8); + d27u8 = vabd_u8(d3u8, d6u8); + d28u8 = vabd_u8(d18u8, d7u8); - d19u8 = vmax_u8(d19u8, d23u8); + d19u8 = vmax_u8(d19u8, d23u8); - d23u8 = vabd_u8(d5u8, d16u8); - d24u8 = vqadd_u8(d24u8, d24u8); + d23u8 = vabd_u8(d5u8, d16u8); + d24u8 = vqadd_u8(d24u8, d24u8); + d19u8 = vcge_u8(dlimit, d19u8); - d19u8 = vcge_u8(dlimit, d19u8); + d25u8 = vmax_u8(d25u8, d26u8); + d26u8 = vmax_u8(d27u8, d28u8); + d23u8 = vshr_n_u8(d23u8, 1); - d25u8 = vmax_u8(d25u8, d26u8); - d26u8 = vmax_u8(d27u8, d28u8); + d25u8 = vmax_u8(d25u8, d26u8); - d23u8 = vshr_n_u8(d23u8, 1); + d24u8 = vqadd_u8(d24u8, d23u8); - d25u8 = vmax_u8(d25u8, d26u8); + d20u8 = vmax_u8(d20u8, d25u8); - d24u8 = vqadd_u8(d24u8, d23u8); + d23u8 = vdup_n_u8(1); + d24u8 = vcge_u8(dblimit, d24u8); - d20u8 = vmax_u8(d20u8, d25u8); + d21u8 = vcgt_u8(d21u8, dthresh); - d23u8 = vdup_n_u8(1); - d24u8 = vcge_u8(dblimit, d24u8); + d20u8 = vcge_u8(d23u8, d20u8); - d21u8 = vcgt_u8(d21u8, dthresh); + d19u8 = vand_u8(d19u8, d24u8); - d20u8 = vcge_u8(d23u8, d20u8); + d23u8 = vcgt_u8(d22u8, dthresh); - d19u8 = vand_u8(d19u8, d24u8); + d20u8 = vand_u8(d20u8, d19u8); - d23u8 = vcgt_u8(d22u8, dthresh); + d22u8 = vdup_n_u8(0x80); - d20u8 = vand_u8(d20u8, d19u8); + d23u8 = vorr_u8(d21u8, d23u8); - d22u8 = vdup_n_u8(0x80); + q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8)); - d23u8 = vorr_u8(d21u8, d23u8); + d30u8 = vshrn_n_u16(q10u16, 4); + flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); - q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), - vreinterpret_u16_u8(d21u8)); + if (flat == 0xffffffff) { // Check for all 1's, power_branch_only + d27u8 = vdup_n_u8(3); + d21u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d21u8); + q14u16 = vaddw_u8(q14u16, d5u8); + *d0ru8 = vqrshrn_n_u16(q14u16, 3); - d30u8 = vshrn_n_u16(q10u16, 4); - flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); + *d1ru8 = vqrshrn_n_u16(q14u16, 3); - if (flat == 0xffffffff) { // Check for all 1's, power_branch_only - d27u8 = vdup_n_u8(3); - d21u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d21u8); - q14u16 = vaddw_u8(q14u16, d5u8); - *d0ru8 = vqrshrn_n_u16(q14u16, 3); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = 
vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + *d2ru8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - *d1ru8 = vqrshrn_n_u16(q14u16, 3); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d3ru8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - *d2ru8 = vqrshrn_n_u16(q14u16, 3); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d4ru8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d3ru8 = vqrshrn_n_u16(q14u16, 3); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d5ru8 = vqrshrn_n_u16(q14u16, 3); + } else { + d21u8 = veor_u8(d7u8, d22u8); + d24u8 = veor_u8(d6u8, d22u8); + d25u8 = veor_u8(d5u8, d22u8); + d26u8 = veor_u8(d16u8, d22u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d4ru8 = vqrshrn_n_u16(q14u16, 3); + d27u8 = vdup_n_u8(3); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d5ru8 = vqrshrn_n_u16(q14u16, 3); - } else { - d21u8 = veor_u8(d7u8, d22u8); - d24u8 = veor_u8(d6u8, d22u8); - d25u8 = veor_u8(d5u8, d22u8); - d26u8 = veor_u8(d16u8, d22u8); + d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); + d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); - d27u8 = vdup_n_u8(3); + q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); - d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); - d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); + d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); - q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); + q15s16 = vaddw_s8(q15s16, d29s8); - d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); + d29u8 = vdup_n_u8(4); - q15s16 = vaddw_s8(q15s16, d29s8); + d28s8 = vqmovn_s16(q15s16); - d29u8 = vdup_n_u8(4); + d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); - d28s8 = vqmovn_s16(q15s16); + d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); + d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); + d30s8 = vshr_n_s8(d30s8, 3); + d29s8 = vshr_n_s8(d29s8, 3); - d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); + d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); + d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); - d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); - d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); - d30s8 = vshr_n_s8(d30s8, 3); - d29s8 = vshr_n_s8(d29s8, 3); + d29s8 = vrshr_n_s8(d29s8, 1); + d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); - d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); - d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); + d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); - d29s8 = vrshr_n_s8(d29s8, 1); - d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - d25s8 = 
vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); - - if (flat == 0) { // filter_branch_only - *d0ru8 = d4u8; - *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - *d5ru8 = d17u8; - return; - } + if (flat == 0) { // filter_branch_only + *d0ru8 = d4u8; + *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + *d5ru8 = d17u8; + return; + } - d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - d23u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d23u8); + d23u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d23u8); - d0u8 = vbsl_u8(d20u8, dblimit, d4u8); + d0u8 = vbsl_u8(d20u8, dblimit, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d5u8); - d1u8 = vbsl_u8(d20u8, dlimit, d25u8); + d1u8 = vbsl_u8(d20u8, dlimit, d25u8); - d30u8 = vqrshrn_n_u16(q14u16, 3); + d30u8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); - d2u8 = vbsl_u8(d20u8, dthresh, d24u8); + d2u8 = vbsl_u8(d20u8, dthresh, d24u8); - d31u8 = vqrshrn_n_u16(q14u16, 3); + d31u8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); - *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); + *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); - d23u8 = vqrshrn_n_u16(q14u16, 3); + d23u8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); - *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); + *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); - q14u16 = vaddw_u8(q14u16, d18u8); + q14u16 = vaddw_u8(q14u16, d18u8); - *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); + *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); - d22u8 = vqrshrn_n_u16(q14u16, 3); + d22u8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); - d3u8 = vbsl_u8(d20u8, d3u8, d21u8); + d3u8 = vbsl_u8(d20u8, d3u8, d21u8); - q14u16 = vaddw_u8(q14u16, d18u8); + q14u16 = vaddw_u8(q14u16, d18u8); - d4u8 = vbsl_u8(d20u8, d4u8, d26u8); + d4u8 = vbsl_u8(d20u8, d4u8, d26u8); - d6u8 = 
vqrshrn_n_u16(q14u16, 3); + d6u8 = vqrshrn_n_u16(q14u16, 3); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); - d5u8 = vbsl_u8(d20u8, d5u8, d17u8); + d5u8 = vbsl_u8(d20u8, d5u8, d17u8); - d7u8 = vqrshrn_n_u16(q14u16, 3); + d7u8 = vqrshrn_n_u16(q14u16, 3); - *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); - *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); - *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); - } - return; + *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); + *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); + *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); + } + return; } -void vpx_lpf_horizontal_8_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - mbloop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); - - s -= (pitch * 6); - vst1_u8(s, d0u8); - s += pitch; - vst1_u8(s, d1u8); - s += pitch; - vst1_u8(s, d2u8); - s += pitch; - vst1_u8(s, d3u8); - s += pitch; - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - } - return; +void vpx_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < 1; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, + &d5u8); + + s -= (pitch * 6); + vst1_u8(s, d0u8); + s += pitch; + vst1_u8(s, d1u8); + s += pitch; + vst1_u8(s, d2u8); + s += pitch; + vst1_u8(s, d3u8); + s += pitch; + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + } + return; } -void vpx_lpf_vertical_8_neon( - uint8_t *src, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - int i; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - uint8x8x2_t d2Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - for (i = 0; i 
< 1; i++) { - s = src + (i * (pitch << 3)) - 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), - vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), - vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), - vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), - vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - mbloop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); - - d4Result.val[0] = d0u8; - d4Result.val[1] = d1u8; - d4Result.val[2] = d2u8; - d4Result.val[3] = d3u8; - - d2Result.val[0] = d4u8; - d2Result.val[1] = d5u8; - - s = src - 3; - vst4_lane_u8(s, d4Result, 0); - s += pitch; - vst4_lane_u8(s, d4Result, 1); - s += pitch; - vst4_lane_u8(s, d4Result, 2); - s += pitch; - vst4_lane_u8(s, d4Result, 3); - s += pitch; - vst4_lane_u8(s, d4Result, 4); - s += pitch; - vst4_lane_u8(s, d4Result, 5); - s += pitch; - vst4_lane_u8(s, d4Result, 6); - s += pitch; - vst4_lane_u8(s, d4Result, 7); - - s = src + 1; - vst2_lane_u8(s, d2Result, 0); - s += pitch; - vst2_lane_u8(s, d2Result, 1); - s += pitch; - vst2_lane_u8(s, d2Result, 2); - s += pitch; - vst2_lane_u8(s, d2Result, 3); - s += pitch; - vst2_lane_u8(s, d2Result, 4); - s += pitch; - vst2_lane_u8(s, d2Result, 5); - s += pitch; - vst2_lane_u8(s, d2Result, 6); - s += pitch; - vst2_lane_u8(s, d2Result, 7); - } - return; +void vpx_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + uint8x8x2_t d2Result; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + for (i = 0; i < 1; i++) { + s = src + (i * (pitch << 3)) - 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = 
vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, + &d5u8); + + d4Result.val[0] = d0u8; + d4Result.val[1] = d1u8; + d4Result.val[2] = d2u8; + d4Result.val[3] = d3u8; + + d2Result.val[0] = d4u8; + d2Result.val[1] = d5u8; + + s = src - 3; + vst4_lane_u8(s, d4Result, 0); + s += pitch; + vst4_lane_u8(s, d4Result, 1); + s += pitch; + vst4_lane_u8(s, d4Result, 2); + s += pitch; + vst4_lane_u8(s, d4Result, 3); + s += pitch; + vst4_lane_u8(s, d4Result, 4); + s += pitch; + vst4_lane_u8(s, d4Result, 5); + s += pitch; + vst4_lane_u8(s, d4Result, 6); + s += pitch; + vst4_lane_u8(s, d4Result, 7); + + s = src + 1; + vst2_lane_u8(s, d2Result, 0); + s += pitch; + vst2_lane_u8(s, d2Result, 1); + s += pitch; + vst2_lane_u8(s, d2Result, 2); + s += pitch; + vst2_lane_u8(s, d2Result, 3); + s += pitch; + vst2_lane_u8(s, d2Result, 4); + s += pitch; + vst2_lane_u8(s, d2Result, 5); + s += pitch; + vst2_lane_u8(s, d2Result, 6); + s += pitch; + vst2_lane_u8(s, d2Result, 7); + } + return; } diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c index aa31f293588cab5e67e8a131f73091ec776352a1..9129b5d2d5596d8595315cc016aa08d82ba6761d 100644 --- a/vpx_dsp/arm/loopfilter_neon.c +++ b/vpx_dsp/arm/loopfilter_neon.c @@ -14,42 +14,32 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); } #if HAVE_NEON_ASM -void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { +void vpx_lpf_horizontal_8_dual_neon( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t 
*thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); } -void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); } -void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, - const uint8_t *blimit, +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh); diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index c7704dc1be67266995b6e117adba5248f53307b3..dc20398000aaa4daffb16ef65a9c0c7863dd7c78 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -16,10 +16,10 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), - vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), - vget_high_u16(vec_hi)); + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); const uint64x2_t b = vpaddlq_u32(a); const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), @@ -33,8 +33,7 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, static void sad_neon_64(const uint8x16_t vec_src_00, const uint8x16_t vec_src_16, const uint8x16_t vec_src_32, - const uint8x16_t vec_src_48, - const uint8_t *ref, + const uint8x16_t vec_src_48, const uint8_t *ref, uint16x8_t *vec_sum_ref_lo, uint16x8_t *vec_sum_ref_hi) { const uint8x16_t vec_ref_00 = vld1q_u8(ref); @@ -63,8 +62,7 @@ static void sad_neon_64(const uint8x16_t vec_src_00, // Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, // and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. 
static void sad_neon_32(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, - const uint8_t *ref, + const uint8x16_t vec_src_16, const uint8_t *ref, uint16x8_t *vec_sum_ref_lo, uint16x8_t *vec_sum_ref_hi) { const uint8x16_t vec_ref_00 = vld1q_u8(ref); @@ -81,7 +79,7 @@ static void sad_neon_32(const uint8x16_t vec_src_00, } void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, - const uint8_t* const ref[4], int ref_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t *res) { int i; uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); @@ -127,7 +125,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, } void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t* const ref[4], int ref_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t *res) { int i; uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); @@ -148,14 +146,14 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, const uint8x16_t vec_src_00 = vld1q_u8(src); const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - sad_neon_32(vec_src_00, vec_src_16, ref0, - &vec_sum_ref0_lo, &vec_sum_ref0_hi); - sad_neon_32(vec_src_00, vec_src_16, ref1, - &vec_sum_ref1_lo, &vec_sum_ref1_hi); - sad_neon_32(vec_src_00, vec_src_16, ref2, - &vec_sum_ref2_lo, &vec_sum_ref2_hi); - sad_neon_32(vec_src_00, vec_src_16, ref3, - &vec_sum_ref3_lo, &vec_sum_ref3_hi); + sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, + &vec_sum_ref0_hi); + sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, + &vec_sum_ref1_hi); + sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, + &vec_sum_ref2_hi); + sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, + &vec_sum_ref3_hi); src += src_stride; ref0 += ref_stride; @@ -171,7 +169,7 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, } void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t* const ref[4], int ref_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t *res) { int i; uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); @@ -195,20 +193,20 @@ void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, const uint8x16_t vec_ref2 = vld1q_u8(ref2); const uint8x16_t vec_ref3 = vld1q_u8(ref3); - vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), - vget_low_u8(vec_ref0)); + vec_sum_ref0_lo = + vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref0)); - vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), - vget_low_u8(vec_ref1)); + vec_sum_ref1_lo = + vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref1)); - vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), - vget_low_u8(vec_ref2)); + vec_sum_ref2_lo = + vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref2)); - vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), - vget_low_u8(vec_ref3)); + vec_sum_ref3_lo = + vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref3)); diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index 173f08ac3c3e202764a8dd01a43a9b8877d08289..ff3228768cefc8455f65267982b881c9c4ebf7d2 100644 --- a/vpx_dsp/arm/sad_neon.c +++ 
b/vpx_dsp/arm/sad_neon.c @@ -14,114 +14,105 @@ #include "vpx/vpx_integer.h" -unsigned int vpx_sad8x16_neon( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; +unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + for (i = 0; i < 15; i++) { d0 = vld1_u8(src_ptr); src_ptr += src_stride; d8 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 15; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } - - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); } -unsigned int vpx_sad4x4_neon( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x2_t d1; - uint64x1_t d3; - int i; +unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x2_t d1; + uint64x1_t d3; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + for (i = 0; i < 3; i++) { d0 = vld1_u8(src_ptr); src_ptr += src_stride; d8 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); - - for (i = 0; i < 3; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } + q12 = vabal_u8(q12, d0, d8); + } - d1 = vpaddl_u16(vget_low_u16(q12)); - d3 = vpaddl_u32(d1); + d1 = vpaddl_u16(vget_low_u16(q12)); + d3 = vpaddl_u32(d1); - return vget_lane_u32(vreinterpret_u32_u64(d3), 0); + return vget_lane_u32(vreinterpret_u32_u64(d3), 0); } -unsigned int vpx_sad16x8_neon( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride) { - uint8x16_t q0, q4; - uint16x8_t q12, q13; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; +unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + for (i = 0; i < 7; i++) { q0 = vld1q_u8(src_ptr); src_ptr += src_stride; q4 = vld1q_u8(ref_ptr); ref_ptr += ref_stride; - q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); - - for (i = 0; i < 7; i++) { - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabal_u8(q13, 
vget_high_u8(q0), vget_high_u8(q4)); - } - - q12 = vaddq_u16(q12, q13); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); + q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); } static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), - vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), - vget_high_u16(vec_hi)); + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); const uint64x2_t b = vpaddlq_u32(a); const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), @@ -208,10 +199,10 @@ unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, const uint8x16_t vec_ref = vld1q_u8(ref); src += src_stride; ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src), - vget_low_u8(vec_ref)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref)); + vec_accum_lo = + vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); + vec_accum_hi = + vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); } return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); } diff --git a/vpx_dsp/arm/subpel_variance_media.c b/vpx_dsp/arm/subpel_variance_media.c index e7d8c85fb510028d6fde9330fd589d0841a95e48..ab53361579df363db0dfd5464aee8d860b06dc07 100644 --- a/vpx_dsp/arm/subpel_variance_media.c +++ b/vpx_dsp/arm/subpel_variance_media.c @@ -14,91 +14,66 @@ #include "vpx_ports/mem.h" #if HAVE_MEDIA -static const int16_t bilinear_filters_media[8][2] = { - { 128, 0 }, - { 112, 16 }, - { 96, 32 }, - { 80, 48 }, - { 64, 64 }, - { 48, 80 }, - { 32, 96 }, - { 16, 112 } -}; +static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 }, + { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, + { 32, 96 }, { 16, 112 } }; -extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr, - uint16_t *dst_ptr, - uint32_t src_pitch, - uint32_t height, - uint32_t width, - const int16_t *filter); +extern void vpx_filter_block2d_bil_first_pass_media( + const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch, + uint32_t height, uint32_t width, const int16_t *filter); -extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr, - uint8_t *dst_ptr, - int32_t src_pitch, - uint32_t height, - uint32_t width, - const int16_t *filter); +extern void vpx_filter_block2d_bil_second_pass_media( + const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch, + uint32_t height, uint32_t width, const int16_t *filter); - -unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t first_pass[10*8]; - uint8_t second_pass[8*8]; +unsigned int vpx_sub_pixel_variance8x8_media( + const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int 
yoffset, + const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { + uint16_t first_pass[10 * 8]; + uint8_t second_pass[8 * 8]; const int16_t *HFilter, *VFilter; HFilter = bilinear_filters_media[xoffset]; VFilter = bilinear_filters_media[yoffset]; vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass, - src_pixels_per_line, - 9, 8, HFilter); - vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, - 8, 8, 8, VFilter); + src_pixels_per_line, 9, 8, HFilter); + vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8, + VFilter); - return vpx_variance8x8_media(second_pass, 8, dst_ptr, - dst_pixels_per_line, sse); + return vpx_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line, + sse); } -unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t first_pass[36*16]; - uint8_t second_pass[20*16]; +unsigned int vpx_sub_pixel_variance16x16_media( + const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, + const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { + uint16_t first_pass[36 * 16]; + uint8_t second_pass[20 * 16]; const int16_t *HFilter, *VFilter; unsigned int var; if (xoffset == 4 && yoffset == 0) { - var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - sse); + var = vpx_variance_halfpixvar16x16_h_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); } else if (xoffset == 0 && yoffset == 4) { - var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - sse); + var = vpx_variance_halfpixvar16x16_v_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); } else if (xoffset == 4 && yoffset == 4) { - var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - sse); + var = vpx_variance_halfpixvar16x16_hv_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); } else { HFilter = bilinear_filters_media[xoffset]; VFilter = bilinear_filters_media[yoffset]; - vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass, - src_pixels_per_line, - 17, 16, HFilter); - vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, - 16, 16, 16, VFilter); + vpx_filter_block2d_bil_first_pass_media( + src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter); + vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16, + 16, VFilter); - var = vpx_variance16x16_media(second_pass, 16, dst_ptr, - dst_pixels_per_line, sse); + var = vpx_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line, + sse); } return var; } diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 40e2cc89b35d3e657bb1d025bf548091672138fa..f044e11a1553651086eb88f0266713e62e05de3f 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -18,14 +18,8 @@ #include "vpx_dsp/variance.h" static const uint8_t bilinear_filters[8][2] = { - { 128, 0, }, - { 112, 16, }, - { 96, 32, }, - { 80, 48, }, - { 64, 64, }, - { 48, 80, }, - { 32, 96, }, - { 16, 112, }, + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, @@ -79,74 +73,61 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, 
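Each pair in the bilinear filter tables above sums to 128, one pair per eighth-pel offset; the two-pass helpers apply them horizontally and then vertically before the variance is measured. A minimal scalar sketch of a single tap, assuming the usual rounding shift by 7 that matches taps summing to 128 (illustrative name, not the library's API):

#include <stdint.h>

/* One two-tap bilinear step: blend a pixel with its neighbour using a
 * filter pair that sums to 128, then normalize with a rounding shift. */
static uint8_t bilinear_tap(uint8_t a, uint8_t b, const int16_t *filter) {
  /* e.g. xoffset == 3 selects { 80, 48 }: 5/8 of a plus 3/8 of b. */
  return (uint8_t)((a * filter[0] + b * filter[1] + 64) >> 7);
}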
} } -unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, - int src_stride, - int xoffset, - int yoffset, - const uint8_t *dst, - int dst_stride, +unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, unsigned int *sse) { DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); - var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, - 9, 8, + var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, bilinear_filters[xoffset]); - var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, - 8, bilinear_filters[yoffset]); + var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, + bilinear_filters[yoffset]); return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); } unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src, - int src_stride, - int xoffset, - int yoffset, - const uint8_t *dst, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, int dst_stride, unsigned int *sse) { DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, - 17, 16, + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, - 16, bilinear_filters[yoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, + bilinear_filters[yoffset]); return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); } unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src, - int src_stride, - int xoffset, - int yoffset, - const uint8_t *dst, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, int dst_stride, unsigned int *sse) { DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, - 33, 32, + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, - 32, bilinear_filters[yoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, + bilinear_filters[yoffset]); return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); } unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src, - int src_stride, - int xoffset, - int yoffset, - const uint8_t *dst, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, int dst_stride, unsigned int *sse) { DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, - 65, 64, + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, - 64, bilinear_filters[yoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, + bilinear_filters[yoffset]); return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); } diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c index 7b146095ea2f5aed80e05a3bd7f52e1d73c62405..ce81fb630f248f2f2053a15befb1691333b6d083 100644 --- a/vpx_dsp/arm/subtract_neon.c +++ b/vpx_dsp/arm/subtract_neon.c @@ -13,10 +13,10 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -void vpx_subtract_block_neon(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src, ptrdiff_t src_stride, - const uint8_t *pred, ptrdiff_t pred_stride) { +void vpx_subtract_block_neon(int rows, int 
cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { int r, c; if (cols > 16) { @@ -24,38 +24,38 @@ void vpx_subtract_block_neon(int rows, int cols, for (c = 0; c < cols; c += 32) { const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); - const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); + const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); - const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00), - vget_low_u8(v_pred_00)); - const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00), - vget_high_u8(v_pred_00)); - const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16), - vget_low_u8(v_pred_16)); - const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16), - vget_high_u8(v_pred_16)); - vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); - vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); + const uint16x8_t v_diff_lo_00 = + vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); + const uint16x8_t v_diff_hi_00 = + vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); + const uint16x8_t v_diff_lo_16 = + vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); + const uint16x8_t v_diff_hi_16 = + vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); } diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } else if (cols > 8) { for (r = 0; r < rows; ++r) { const uint8x16_t v_src = vld1q_u8(&src[0]); const uint8x16_t v_pred = vld1q_u8(&pred[0]); - const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src), - vget_low_u8(v_pred)); - const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src), - vget_high_u8(v_pred)); + const uint16x8_t v_diff_lo = + vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); + const uint16x8_t v_diff_hi = + vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } else if (cols > 4) { for (r = 0; r < rows; ++r) { @@ -65,16 +65,15 @@ void vpx_subtract_block_neon(int rows, int cols, vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } else { for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) - diff[c] = src[c] - pred[c]; + for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } } diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index ede6e7bbb03b6915f6e19a1c383d86fd7477646c..f469afc4e4b8db4a93c6607a72678e315e3d3b2d 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -32,9 +32,9 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { } // w * h must be less than 2048 or local variable v_sum may overflow. 
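For orientation, a scalar sketch of what the NEON variance kernels below accumulate: a sum of differences and a sum of squared differences over a w x h block, from which the callers form variance = sse - sum^2 / (w * h); the >> 11 and >> 12 in the vpx_variance*_neon functions are that division for 2048- and 4096-pixel blocks, and the 16-bit lanes of v_sum are why the w * h cap in the comment above exists. Illustrative code, assuming nothing beyond the arithmetic visible in the diff:

#include <stdint.h>

/* Scalar reference for the sum/sse accumulation done by variance_neon_w8. */
static uint32_t variance_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride, int w, int h) {
  int64_t sum = 0, sse = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int d = a[j] - b[j];
      sum += d;       /* signed sum of differences */
      sse += d * d;   /* sum of squared differences */
    }
    a += a_stride;
    b += b_stride;
  }
  return (uint32_t)(sse - (sum * sum) / (w * h));
}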
-static void variance_neon_w8(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, uint32_t *sse, int *sum) { +static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { int i, j; int16x8_t v_sum = vdupq_n_s16(0); int32x4_t v_sse_lo = vdupq_n_s32(0); @@ -47,12 +47,10 @@ static void variance_neon_w8(const uint8_t *a, int a_stride, const uint16x8_t v_diff = vsubl_u8(v_a, v_b); const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = vmlal_s16(v_sse_lo, - vget_low_s16(sv_diff), - vget_low_s16(sv_diff)); - v_sse_hi = vmlal_s16(v_sse_hi, - vget_high_s16(sv_diff), - vget_high_s16(sv_diff)); + v_sse_lo = + vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); + v_sse_hi = + vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); } a += a_stride; b += b_stride; @@ -62,15 +60,13 @@ static void variance_neon_w8(const uint8_t *a, int a_stride, *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); } -void vpx_get8x8var_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse, int *sum) { +void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); } -void vpx_get16x16var_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse, int *sum) { +void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); } @@ -104,9 +100,8 @@ unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, int sum1, sum2; uint32_t sse1, sse2; variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w8(a + (32 * a_stride), a_stride, - b + (32 * b_stride), b_stride, 32, 32, - &sse2, &sum2); + variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride, + 32, 32, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 @@ -118,9 +113,8 @@ unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, int sum1, sum2; uint32_t sse1, sse2; variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, - b + (16 * b_stride), b_stride, 64, 16, - &sse2, &sum2); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 @@ -133,286 +127,273 @@ unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, uint32_t sse1, sse2; variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, - b + (16 * b_stride), b_stride, 64, 16, - &sse2, &sum2); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 2 * a_stride), a_stride, - b + (16 * 2 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), + b_stride, 64, 16, &sse2, &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w8(a + (16 * 3 * a_stride), a_stride, - b + (16 * 3 * b_stride), b_stride, - 64, 16, &sse2, &sum2); + variance_neon_w8(a + 
(16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), + b_stride, 64, 16, &sse2, &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 } -unsigned int vpx_variance16x8_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); +unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = 
vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - return vget_lane_u32(d0u32, 0); -} + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); -unsigned int vpx_variance8x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); + return vget_lane_u32(d0u32, 0); +} - d0s64 = vadd_s64(vget_low_s64(q0s64), 
vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); +unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } - return vget_lane_u32(d0u32, 0); -} + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); -unsigned int vpx_mse16x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - int64x1_t d0s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - q7s32 = vdupq_n_s32(0); - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // mse16x16_neon_loop - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q7s32 = vmlal_s16(q7s32, d22s16, d22s16); - q8s32 = vmlal_s16(q8s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q7s32 = vmlal_s16(q7s32, d26s16, d26s16); - q8s32 = vmlal_s16(q8s32, d27s16, d27s16); - - d28s16 = 
vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q10s32 = vaddq_s32(q7s32, q9s32); + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - q1s64 = vpaddlq_s32(q10s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(d0u32, 0); } -unsigned int vpx_get4x4sse_cs_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride) { - int16x4_t d22s16, d24s16, d26s16, d28s16; - int64x1_t d0s64; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - d0u8 = vld1_u8(src_ptr); +unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int64x1_t d0s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + q7s32 = vdupq_n_s32(0); + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // mse16x16_neon_loop + q0u8 = vld1q_u8(src_ptr); src_ptr += source_stride; - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d1u8 = vld1_u8(src_ptr); + q1u8 = vld1q_u8(src_ptr); src_ptr += source_stride; - d5u8 = vld1_u8(ref_ptr); + q2u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d7u8 = vld1_u8(ref_ptr); + q3u8 = vld1q_u8(ref_ptr); ref_ptr += recon_stride; - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); - - d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); - d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); - d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); - d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q7s32 = vmlal_s16(q7s32, d22s16, d22s16); + q8s32 = vmlal_s16(q8s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q7s32 = 
vmlal_s16(q7s32, d26s16, d26s16); + q8s32 = vmlal_s16(q8s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } - q7s32 = vmull_s16(d22s16, d22s16); - q8s32 = vmull_s16(d24s16, d24s16); - q9s32 = vmull_s16(d26s16, d26s16); - q10s32 = vmull_s16(d28s16, d28s16); + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q10s32 = vaddq_s32(q7s32, q9s32); - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q9s32 = vaddq_s32(q7s32, q9s32); + q1s64 = vpaddlq_s32(q10s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - q1s64 = vpaddlq_s32(q9s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride) { + int16x4_t d22s16, d24s16, d26s16, d28s16; + int64x1_t d0s64; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d3u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d7u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); + d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); + d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); + d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + + q7s32 = vmull_s16(d22s16, d22s16); + q8s32 = vmull_s16(d24s16, d24s16); + q9s32 = vmull_s16(d26s16, d26s16); + q10s32 = vmull_s16(d28s16, d28s16); + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q9s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q9s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); } diff --git a/vpx_dsp/arm/vpx_convolve8_avg_neon.c b/vpx_dsp/arm/vpx_convolve8_avg_neon.c index 8632250138c18b7f7ce86cac0892a76619857026..69cb28400538d1f8767c7dc863d764206158c701 100644 --- a/vpx_dsp/arm/vpx_convolve8_avg_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_avg_neon.c @@ -16,16 +16,11 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -static INLINE int32x4_t MULTIPLY_BY_Q0( - int16x4_t dsrc0, - int16x4_t dsrc1, - int16x4_t dsrc2, - int16x4_t dsrc3, - int16x4_t dsrc4, - int16x4_t dsrc5, - int16x4_t dsrc6, - int16x4_t dsrc7, - int16x8_t q0s16) { +static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, + int16x4_t dsrc2, int16x4_t dsrc3, + int16x4_t dsrc4, int16x4_t dsrc5, + int16x4_t dsrc6, int16x4_t dsrc7, + int16x8_t q0s16) { int32x4_t qdst; int16x4_t d0s16, d1s16; @@ -43,17 +38,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0( return qdst; } -void vpx_convolve8_avg_horiz_neon( - const uint8_t *src, - ptrdiff_t 
src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, - int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, - int h) { +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, int h) { int width; const uint8_t *s; uint8_t *d; @@ -76,7 +66,7 @@ void vpx_convolve8_avg_horiz_neon( q0s16 = vld1q_s16(filter_x); - src -= 3; // adjust for taps + src -= 3; // adjust for taps for (; h > 0; h -= 4) { // loop_horiz_v s = src; d24u8 = vld1_u8(s); @@ -90,8 +80,8 @@ void vpx_convolve8_avg_horiz_neon( q12u8 = vcombine_u8(d24u8, d25u8); q13u8 = vcombine_u8(d26u8, d27u8); - q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), - vreinterpretq_u16_u8(q13u8)); + q0x2u16 = + vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); @@ -116,10 +106,8 @@ void vpx_convolve8_avg_horiz_neon( q9u16 = vcombine_u16(d17u16, d19u16); d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w; - width > 0; - width -= 4, src += 4, dst += 4) { // loop_horiz + d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 + for (width = w; width > 0; width -= 4, src += 4, dst += 4) { // loop_horiz s = src; d28u32 = vld1_dup_u32((const uint32_t *)s); s += src_stride; @@ -131,10 +119,10 @@ void vpx_convolve8_avg_horiz_neon( __builtin_prefetch(src + 64); - d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), - vreinterpret_u16_u32(d31u32)); - d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), - vreinterpret_u16_u32(d30u32)); + d0x2u16 = + vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); + d1x2u16 = + vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 vreinterpret_u8_u16(d1x2u16.val[0])); // d29 d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 @@ -144,8 +132,8 @@ void vpx_convolve8_avg_horiz_neon( q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), - vreinterpretq_u32_u8(q15u8)); + q0x2u32 = + vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); @@ -173,14 +161,14 @@ void vpx_convolve8_avg_horiz_neon( d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, - d18s16, d19s16, d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, - d19s16, d23s16, d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, d27s16, d25s16, q0s16); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, + d23s16, d24s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, + d24s16, d26s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, + d26s16, d27s16, q0s16); + 
q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, + d27s16, d25s16, q0s16); __builtin_prefetch(src + 64 + src_stride * 3); @@ -195,8 +183,7 @@ void vpx_convolve8_avg_horiz_neon( d2u8 = vqmovn_u16(q1u16); d3u8 = vqmovn_u16(q2u16); - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), - vreinterpret_u16_u8(d3u8)); + d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), vreinterpret_u32_u16(d0x2u16.val[1])); d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), @@ -231,17 +218,12 @@ void vpx_convolve8_avg_horiz_neon( return; } -void vpx_convolve8_avg_vert_neon( - const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, - int y_step_q4, - int w, - int h) { +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused + const int16_t *filter_y, int y_step_q4, int w, + int h) { int height; const uint8_t *s; uint8_t *d; @@ -277,8 +259,8 @@ void vpx_convolve8_avg_vert_neon( d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); s += src_stride; - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); + q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); + q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); @@ -319,20 +301,20 @@ void vpx_convolve8_avg_vert_neon( __builtin_prefetch(s); __builtin_prefetch(s + src_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, - d20s16, d21s16, d22s16, d24s16, q0s16); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, + d22s16, d24s16, q0s16); __builtin_prefetch(s + src_stride * 2); __builtin_prefetch(s + src_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, - d21s16, d22s16, d24s16, d26s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, + d24s16, d26s16, q0s16); __builtin_prefetch(d); __builtin_prefetch(d + dst_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, d26s16, d27s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, + d26s16, d27s16, q0s16); __builtin_prefetch(d + dst_stride * 2); __builtin_prefetch(d + dst_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, d27s16, d25s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, + d27s16, d25s16, q0s16); d2u16 = vqrshrun_n_s32(q1s32, 7); d3u16 = vqrshrun_n_s32(q2s32, 7); diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 9bd715e2c630b2f65adeeb18195f6eea9b2685d8..514525696b0dd99922a4e0e7ab67394de1644310 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -16,16 +16,11 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -static INLINE int32x4_t MULTIPLY_BY_Q0( - int16x4_t dsrc0, - int16x4_t dsrc1, - int16x4_t dsrc2, - int16x4_t dsrc3, - int16x4_t dsrc4, - int16x4_t dsrc5, - int16x4_t dsrc6, - int16x4_t dsrc7, - int16x8_t q0s16) { +static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, + int16x4_t dsrc2, int16x4_t dsrc3, + int16x4_t dsrc4, int16x4_t dsrc5, + int16x4_t dsrc6, int16x4_t dsrc7, + int16x8_t q0s16) { int32x4_t qdst; int16x4_t d0s16, 
d1s16; @@ -43,17 +38,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0( return qdst; } -void vpx_convolve8_horiz_neon( - const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, - int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, - int h) { +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, int h) { int width; const uint8_t *s, *psrc; uint8_t *d, *pdst; @@ -77,9 +67,8 @@ void vpx_convolve8_horiz_neon( q0s16 = vld1q_s16(filter_x); src -= 3; // adjust for taps - for (; h > 0; h -= 4, - src += src_stride * 4, - dst += dst_stride * 4) { // loop_horiz_v + for (; h > 0; h -= 4, src += src_stride * 4, + dst += dst_stride * 4) { // loop_horiz_v s = src; d24u8 = vld1_u8(s); s += src_stride; @@ -92,8 +81,8 @@ void vpx_convolve8_horiz_neon( q12u8 = vcombine_u8(d24u8, d25u8); q13u8 = vcombine_u8(d26u8, d27u8); - q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), - vreinterpretq_u16_u8(q13u8)); + q0x2u16 = + vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); @@ -105,8 +94,8 @@ void vpx_convolve8_horiz_neon( __builtin_prefetch(src + src_stride * 5); __builtin_prefetch(src + src_stride * 6); - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); + q8u16 = vmovl_u8(d0x2u8.val[0]); + q9u16 = vmovl_u8(d0x2u8.val[1]); q10u16 = vmovl_u8(d1x2u8.val[0]); q11u16 = vmovl_u8(d1x2u8.val[1]); @@ -119,8 +108,7 @@ void vpx_convolve8_horiz_neon( d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w, psrc = src + 7, pdst = dst; - width > 0; + for (width = w, psrc = src + 7, pdst = dst; width > 0; width -= 4, psrc += 4, pdst += 4) { // loop_horiz s = psrc; d28u32 = vld1_dup_u32((const uint32_t *)s); @@ -133,10 +121,10 @@ void vpx_convolve8_horiz_neon( __builtin_prefetch(psrc + 64); - d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), - vreinterpret_u16_u32(d31u32)); - d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), - vreinterpret_u16_u32(d30u32)); + d0x2u16 = + vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); + d1x2u16 = + vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 vreinterpret_u8_u16(d1x2u16.val[0])); // d29 d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 @@ -146,8 +134,8 @@ void vpx_convolve8_horiz_neon( q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), - vreinterpretq_u32_u8(q15u8)); + q0x2u32 = + vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); @@ -166,14 +154,14 @@ void vpx_convolve8_horiz_neon( d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, - d18s16, d19s16, d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, - d19s16, d23s16, d24s16, d26s16, q0s16); - q14s32 = 
MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, d27s16, d25s16, q0s16); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, + d23s16, d24s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, + d24s16, d26s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, + d26s16, d27s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, + d27s16, d25s16, q0s16); __builtin_prefetch(psrc + 60 + src_stride * 3); @@ -188,8 +176,7 @@ void vpx_convolve8_horiz_neon( d2u8 = vqmovn_u16(q1u16); d3u8 = vqmovn_u16(q2u16); - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), - vreinterpret_u16_u8(d3u8)); + d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), vreinterpret_u32_u16(d0x2u16.val[1])); d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), @@ -217,17 +204,12 @@ void vpx_convolve8_horiz_neon( return; } -void vpx_convolve8_vert_neon( - const uint8_t *src, - ptrdiff_t src_stride, - uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, - int y_step_q4, - int w, - int h) { +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused + const int16_t *filter_y, int y_step_q4, int w, + int h) { int height; const uint8_t *s; uint8_t *d; @@ -261,8 +243,8 @@ void vpx_convolve8_vert_neon( d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); s += src_stride; - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); + q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); + q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); @@ -294,20 +276,20 @@ void vpx_convolve8_vert_neon( __builtin_prefetch(d); __builtin_prefetch(d + dst_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, - d20s16, d21s16, d22s16, d24s16, q0s16); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, + d22s16, d24s16, q0s16); __builtin_prefetch(d + dst_stride * 2); __builtin_prefetch(d + dst_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, - d21s16, d22s16, d24s16, d26s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, + d24s16, d26s16, q0s16); __builtin_prefetch(s); __builtin_prefetch(s + src_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, d26s16, d27s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, + d26s16, d27s16, q0s16); __builtin_prefetch(s + src_stride * 2); __builtin_prefetch(s + src_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, d27s16, d25s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, + d27s16, d25s16, q0s16); d2u16 = vqrshrun_n_s32(q1s32, 7); d3u16 = vqrshrun_n_s32(q2s32, 7); diff --git a/vpx_dsp/arm/vpx_convolve_avg_neon.c b/vpx_dsp/arm/vpx_convolve_avg_neon.c index dc58a332f81d147acc3e9b60f19ac8de32347f9b..abc2511ea291f34c26710c9eae7d3a24138c2940 100644 --- a/vpx_dsp/arm/vpx_convolve_avg_neon.c +++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -13,34 +13,32 @@ #include "./vpx_dsp_rtcd.h" 
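The convolve8 kernels above reduce, per output pixel, to an 8-tap dot product (MULTIPLY_BY_Q0) followed by the rounding narrow vqrshrun_n_s32(..., 7). A scalar sketch of that step, with the clamping written out explicitly (illustrative, not libvpx's reference code):

#include <stdint.h>

/* One 8-tap filter step: multiply-accumulate against the filter, apply a
 * rounding shift by 7 (the taps sum to 128) and saturate to an 8-bit pixel. */
static uint8_t convolve8_pixel(const uint8_t *src, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
  sum = (sum + 64) >> 7; /* rounding shift, as in vqrshrun_n_s32(q, 7) */
  if (sum < 0) sum = 0;  /* unsigned saturation */
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}

The avg variants then fold the result into the existing destination with a rounded halving add, (x + y + 1) >> 1, which is what the vrhadd_u8 calls below implement.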
#include "vpx/vpx_integer.h" -void vpx_convolve_avg_neon( - const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, - int filter_x_stride, - const int16_t *filter_y, - int filter_y_stride, - int w, - int h) { +void vpx_convolve_avg_neon(const uint8_t *src, // r0 + ptrdiff_t src_stride, // r1 + uint8_t *dst, // r2 + ptrdiff_t dst_stride, // r3 + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, int w, + int h) { uint8_t *d; uint8x8_t d0u8, d1u8, d2u8, d3u8; uint32x2_t d0u32, d2u32; uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; d = dst; if (w > 32) { // avg64 for (; h > 0; h -= 1) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + q2u8 = vld1q_u8(src + 32); + q3u8 = vld1q_u8(src + 48); src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); + q8u8 = vld1q_u8(d); + q9u8 = vld1q_u8(d + 16); q10u8 = vld1q_u8(d + 32); q11u8 = vld1q_u8(d + 48); d += dst_stride; @@ -133,8 +131,7 @@ void vpx_convolve_avg_neon( d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1); d += dst_stride; - d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), - vreinterpret_u8_u32(d2u32)); + d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32)); d0u32 = vreinterpret_u32_u8(d0u8); vst1_lane_u32((uint32_t *)dst, d0u32, 0); diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c index d8fb97a861907cc834765d3e259e3d570a34770a..fec189e0e4267ed9fe3f257b59c61b3c7ac2c85d 100644 --- a/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -13,21 +13,19 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_convolve_copy_neon( - const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, - int filter_x_stride, - const int16_t *filter_y, - int filter_y_stride, - int w, - int h) { +void vpx_convolve_copy_neon(const uint8_t *src, // r0 + ptrdiff_t src_stride, // r1 + uint8_t *dst, // r2 + ptrdiff_t dst_stride, // r3 + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, int w, + int h) { uint8x8_t d0u8, d2u8; uint8x16_t q0u8, q1u8, q2u8, q3u8; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; if (w > 32) { // copy64 for (; h > 0; h--) { diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c index 1506ce6203de21ade9449453b47c94237cfa608b..c2d5895b718bd02e8ff91cc45ff62efc7ad46f17 100644 --- a/vpx_dsp/arm/vpx_convolve_neon.c +++ b/vpx_dsp/arm/vpx_convolve_neon.c @@ -14,10 +14,9 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_ports/mem.h" -void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { /* Given our constraints: w <= 64, h <= 64, taps 
== 8 we can reduce the * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). @@ -35,23 +34,20 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, * the temp buffer which has lots of extra room and is subsequently discarded * this is safe if somewhat less than ideal. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, - temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, intermediate_height); + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, + intermediate_height); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + 64 * 3, 64, - dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); int intermediate_height = h + 7; @@ -61,12 +57,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, - temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, intermediate_height); - vpx_convolve8_avg_vert_neon(temp + 64 * 3, - 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, + intermediate_height); + vpx_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index cf7fd36665e04ddd109ddc9a06e77b32dc0b642b..b0c5e9831d85ae6d11b3e233ba6ccfb7f84b5414 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -16,7 +16,8 @@ unsigned int vpx_avg_8x8_c(const uint8_t *src, int stride) { int i, j; int sum = 0; for (i = 0; i < 8; ++i, src += stride) - for (j = 0; j < 8; sum += src[j], ++j) {} + for (j = 0; j < 8; sum += src[j], ++j) { + } return ROUND_POWER_OF_TWO(sum, 6); } @@ -25,7 +26,8 @@ unsigned int vpx_avg_4x4_c(const uint8_t *src, int stride) { int i, j; int sum = 0; for (i = 0; i < 4; ++i, src += stride) - for (j = 0; j < 4; sum += src[j], ++j) {} + for (j = 0; j < 4; sum += src[j], ++j) { + } return ROUND_POWER_OF_TWO(sum, 4); } @@ -80,8 +82,8 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, for (idx = 0; idx < 8; ++idx) { hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit // dynamic range [-2040, 2040] - coeff += 8; // coeff: 15 bit - // dynamic range [-16320, 16320] + coeff += 8; // coeff: 15 bit + // dynamic range [-16320, 16320] ++tmp_buf; } } @@ -92,8 +94,8 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int idx; for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] - const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride - + (idx & 0x01) * 8; + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); } @@ -109,8 +111,8 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t b2 = (a2 + a3) 
>> 1; // [-16320, 16320] int16_t b3 = (a2 - a3) >> 1; - coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] - coeff[64] = b1 + b3; + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; coeff[128] = b0 - b2; coeff[192] = b1 - b3; @@ -123,8 +125,7 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int vpx_satd_c(const int16_t *coeff, int length) { int i; int satd = 0; - for (i = 0; i < length; ++i) - satd += abs(coeff[i]); + for (i = 0; i < length; ++i) satd += abs(coeff[i]); // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] return satd; @@ -140,8 +141,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, int i; hbuf[idx] = 0; // hbuf[idx]: 14 bit, dynamic range [0, 16320]. - for (i = 0; i < height; ++i) - hbuf[idx] += ref[i * ref_stride]; + for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; // hbuf[idx]: 9 bit, dynamic range [0, 510]. hbuf[idx] /= norm_factor; ++ref; @@ -153,16 +153,14 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width) { int idx; int16_t sum = 0; // sum: 14 bit, dynamic range [0, 16320] - for (idx = 0; idx < width; ++idx) - sum += ref[idx]; + for (idx = 0; idx < width; ++idx) sum += ref[idx]; return sum; } // ref: [0 - 510] // src: [0 - 510] // bwl: {2, 3, 4} -int vpx_vector_var_c(const int16_t *ref, const int16_t *src, - const int bwl) { +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) { int i; int width = 4 << bwl; int sse = 0, mean = 0, var; @@ -178,15 +176,14 @@ int vpx_vector_var_c(const int16_t *ref, const int16_t *src, return var; } -void vpx_minmax_8x8_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - int *min, int *max) { +void vpx_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int *min, int *max) { int i, j; *min = 255; *max = 0; for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) { for (j = 0; j < 8; ++j) { - int diff = abs(src[j]-ref[j]); + int diff = abs(src[j] - ref[j]); *min = diff < *min ? diff : *min; *max = diff > *max ? 
diff : *max; } @@ -197,9 +194,10 @@ void vpx_minmax_8x8_c(const uint8_t *src, int src_stride, unsigned int vpx_highbd_avg_8x8_c(const uint8_t *src, int stride) { int i, j; int sum = 0; - const uint16_t* s = CONVERT_TO_SHORTPTR(src); + const uint16_t *s = CONVERT_TO_SHORTPTR(src); for (i = 0; i < 8; ++i, s += stride) - for (j = 0; j < 8; sum += s[j], ++j) {} + for (j = 0; j < 8; sum += s[j], ++j) { + } return ROUND_POWER_OF_TWO(sum, 6); } @@ -207,9 +205,10 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *src, int stride) { unsigned int vpx_highbd_avg_4x4_c(const uint8_t *src, int stride) { int i, j; int sum = 0; - const uint16_t* s = CONVERT_TO_SHORTPTR(src); - for (i = 0; i < 4; ++i, s+=stride) - for (j = 0; j < 4; sum += s[j], ++j) {} + const uint16_t *s = CONVERT_TO_SHORTPTR(src); + for (i = 0; i < 4; ++i, s += stride) + for (j = 0; j < 4; sum += s[j], ++j) { + } return ROUND_POWER_OF_TWO(sum, 4); } @@ -217,18 +216,16 @@ unsigned int vpx_highbd_avg_4x4_c(const uint8_t *src, int stride) { void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max) { int i, j; - const uint16_t* s = CONVERT_TO_SHORTPTR(s8); - const uint16_t* d = CONVERT_TO_SHORTPTR(d8); + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + const uint16_t *d = CONVERT_TO_SHORTPTR(d8); *min = 255; *max = 0; for (i = 0; i < 8; ++i, s += p, d += dp) { for (j = 0; j < 8; ++j) { - int diff = abs(s[j]-d[j]); + int diff = abs(s[j] - d[j]); *min = diff < *min ? diff : *min; *max = diff > *max ? diff : *max; } } } #endif // CONFIG_VP9_HIGHBITDEPTH - - diff --git a/vpx_dsp/bitreader.c b/vpx_dsp/bitreader.c index 8140e78e70e86ecd110fa39c9d2cbd3a45ac5a7b..90cbbba53f47ae8638f61ca44c9d39eaf9cd893d 100644 --- a/vpx_dsp/bitreader.c +++ b/vpx_dsp/bitreader.c @@ -18,11 +18,8 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_util/endian_inl.h" -int vpx_reader_init(vpx_reader *r, - const uint8_t *buffer, - size_t size, - vpx_decrypt_cb decrypt_cb, - void *decrypt_state) { +int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size, + vpx_decrypt_cb decrypt_cb, void *decrypt_state) { if (size && !buffer) { return 1; } else { @@ -55,19 +52,19 @@ void vpx_reader_fill(vpx_reader *r) { buffer_start = r->clear_buffer; } if (bits_left > BD_VALUE_SIZE) { - const int bits = (shift & 0xfffffff8) + CHAR_BIT; - BD_VALUE nv; - BD_VALUE big_endian_values; - memcpy(&big_endian_values, buffer, sizeof(BD_VALUE)); + const int bits = (shift & 0xfffffff8) + CHAR_BIT; + BD_VALUE nv; + BD_VALUE big_endian_values; + memcpy(&big_endian_values, buffer, sizeof(BD_VALUE)); #if SIZE_MAX == 0xffffffffffffffffULL - big_endian_values = HToBE64(big_endian_values); + big_endian_values = HToBE64(big_endian_values); #else - big_endian_values = HToBE32(big_endian_values); + big_endian_values = HToBE32(big_endian_values); #endif - nv = big_endian_values >> (BD_VALUE_SIZE - bits); - count += bits; - buffer += (bits >> 3); - value = r->value | (nv << (shift & 0x7)); + nv = big_endian_values >> (BD_VALUE_SIZE - bits); + count += bits; + buffer += (bits >> 3); + value = r->value | (nv << (shift & 0x7)); } else { const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left); int loop_end = 0; diff --git a/vpx_dsp/bitreader.h b/vpx_dsp/bitreader.h index 9a441b41077e6b4100b746168dddef7bcd74cbf6..6ee2a58632c5e4c5a2c7574f9cc430be74aee37a 100644 --- a/vpx_dsp/bitreader.h +++ b/vpx_dsp/bitreader.h @@ -45,11 +45,8 @@ typedef struct { uint8_t clear_buffer[sizeof(BD_VALUE) + 1]; } vpx_reader; -int vpx_reader_init(vpx_reader *r, - const uint8_t 
*buffer, - size_t size, - vpx_decrypt_cb decrypt_cb, - void *decrypt_state); +int vpx_reader_init(vpx_reader *r, const uint8_t *buffer, size_t size, + vpx_decrypt_cb decrypt_cb, void *decrypt_state); void vpx_reader_fill(vpx_reader *r); @@ -81,8 +78,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) { unsigned int range; unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT; - if (r->count < 0) - vpx_reader_fill(r); + if (r->count < 0) vpx_reader_fill(r); value = r->value; count = r->count; @@ -117,8 +113,7 @@ static INLINE int vpx_read_bit(vpx_reader *r) { static INLINE int vpx_read_literal(vpx_reader *r, int bits) { int literal = 0, bit; - for (bit = bits - 1; bit >= 0; bit--) - literal |= vpx_read_bit(r) << bit; + for (bit = bits - 1; bit >= 0; bit--) literal |= vpx_read_bit(r) << bit; return literal; } @@ -127,8 +122,7 @@ static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree, const vpx_prob *probs) { vpx_tree_index i = 0; - while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) - continue; + while ((i = tree[i + vpx_read(r, probs[i >> 1])]) > 0) continue; return -i; } diff --git a/vpx_dsp/bitreader_buffer.c b/vpx_dsp/bitreader_buffer.c index 595b9bb1233b75fb0598f6d11dac7965358fef2b..bf88119a948c149a4babc9211388db1a3160b804 100644 --- a/vpx_dsp/bitreader_buffer.c +++ b/vpx_dsp/bitreader_buffer.c @@ -30,20 +30,17 @@ int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) { int vpx_rb_read_literal(struct vpx_read_bit_buffer *rb, int bits) { int value = 0, bit; - for (bit = bits - 1; bit >= 0; bit--) - value |= vpx_rb_read_bit(rb) << bit; + for (bit = bits - 1; bit >= 0; bit--) value |= vpx_rb_read_bit(rb) << bit; return value; } -int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, - int bits) { +int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { const int value = vpx_rb_read_literal(rb, bits); return vpx_rb_read_bit(rb) ? 
-value : value; } -int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, - int bits) { +int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { const int nbits = sizeof(unsigned) * 8 - bits - 1; const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits; - return ((int) value) >> nbits; + return ((int)value) >> nbits; } diff --git a/vpx_dsp/bitwriter.c b/vpx_dsp/bitwriter.c index 5b232e346e22a7214add60f5a9d21d760b62585b..81e28b309f573e2cabb1b6c29f9324655eacca86 100644 --- a/vpx_dsp/bitwriter.c +++ b/vpx_dsp/bitwriter.c @@ -14,21 +14,18 @@ void vpx_start_encode(vpx_writer *br, uint8_t *source) { br->lowvalue = 0; - br->range = 255; - br->count = -24; - br->buffer = source; - br->pos = 0; + br->range = 255; + br->count = -24; + br->buffer = source; + br->pos = 0; vpx_write_bit(br, 0); } void vpx_stop_encode(vpx_writer *br) { int i; - for (i = 0; i < 32; i++) - vpx_write_bit(br, 0); + for (i = 0; i < 32; i++) vpx_write_bit(br, 0); // Ensure there's no ambigous collision with any index marker bytes - if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) - br->buffer[br->pos++] = 0; + if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0; } - diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h index d904997af309ffcd36c0295c2d21c551a7cc35cb..41040cf93549829d36297293d1a655a6b52ac09d 100644 --- a/vpx_dsp/bitwriter.h +++ b/vpx_dsp/bitwriter.h @@ -85,8 +85,7 @@ static INLINE void vpx_write_bit(vpx_writer *w, int bit) { static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) { int bit; - for (bit = bits - 1; bit >= 0; bit--) - vpx_write_bit(w, 1 & (data >> bit)); + for (bit = bits - 1; bit >= 0; bit--) vpx_write_bit(w, 1 & (data >> bit)); } #define vpx_write_prob(w, v) vpx_write_literal((w), (v), 8) diff --git a/vpx_dsp/bitwriter_buffer.c b/vpx_dsp/bitwriter_buffer.c index 8633372da3478af419c749f74473e0ebddaa2595..0638622911aeee4fea21e65c2d944c86c3da4098 100644 --- a/vpx_dsp/bitwriter_buffer.c +++ b/vpx_dsp/bitwriter_buffer.c @@ -22,7 +22,7 @@ void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) { const int off = (int)wb->bit_offset; const int p = off / CHAR_BIT; const int q = CHAR_BIT - 1 - off % CHAR_BIT; - if (q == CHAR_BIT -1) { + if (q == CHAR_BIT - 1) { wb->bit_buffer[p] = bit << q; } else { wb->bit_buffer[p] &= ~(1 << q); @@ -33,11 +33,10 @@ void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) { void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) { int bit; - for (bit = bits - 1; bit >= 0; bit--) - vpx_wb_write_bit(wb, (data >> bit) & 1); + for (bit = bits - 1; bit >= 0; bit--) vpx_wb_write_bit(wb, (data >> bit) & 1); } -void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, - int data, int bits) { +void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data, + int bits) { vpx_wb_write_literal(wb, data, bits + 1); } diff --git a/vpx_dsp/blend.h b/vpx_dsp/blend.h index 109183acc41bc49bdffbc03aed1da8eb0881be3e..2ceb4c78f854eb47a4a20c68dbcbc122ffb93c5a 100644 --- a/vpx_dsp/blend.h +++ b/vpx_dsp/blend.h @@ -18,23 +18,23 @@ // Alpha blending with alpha values from the range [0, 64], where 64 // means use the first input and 0 means use the second input. 
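// Illustrative sketch, not part of the patch: VPX_BLEND_A64 below is a
// weighted average with a 6-bit alpha, ROUND_POWER_OF_TWO(a*v0 + (64-a)*v1, 6),
// where ROUND_POWER_OF_TWO(x, n) is a rounded right shift by n. A standalone
// restatement of the arithmetic (the EX_* names are hypothetical):
#define EX_ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
#define EX_BLEND_A64(a, v0, v1) \
  EX_ROUND_POWER_OF_TWO((a) * (v0) + (64 - (a)) * (v1), 6)
// Worked values: EX_BLEND_A64(64, 200, 100) == 200 (alpha 64 keeps the first
// input), EX_BLEND_A64(0, 200, 100) == 100 (alpha 0 keeps the second), and
// EX_BLEND_A64(16, 200, 100) == (16 * 200 + 48 * 100 + 32) >> 6 == 125.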
-#define VPX_BLEND_A64_ROUND_BITS 6 -#define VPX_BLEND_A64_MAX_ALPHA (1 << VPX_BLEND_A64_ROUND_BITS) // 64 +#define VPX_BLEND_A64_ROUND_BITS 6 +#define VPX_BLEND_A64_MAX_ALPHA (1 << VPX_BLEND_A64_ROUND_BITS) // 64 -#define VPX_BLEND_A64(a, v0, v1) \ - ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ +#define VPX_BLEND_A64(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ VPX_BLEND_A64_ROUND_BITS) // Alpha blending with alpha values from the range [0, 256], where 256 // means use the first input and 0 means use the second input. #define VPX_BLEND_A256_ROUND_BITS 8 -#define VPX_BLEND_A256_MAX_ALPHA (1 << VPX_BLEND_A256_ROUND_BITS) // 256 +#define VPX_BLEND_A256_MAX_ALPHA (1 << VPX_BLEND_A256_ROUND_BITS) // 256 -#define VPX_BLEND_A256(a, v0, v1) \ - ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ +#define VPX_BLEND_A256(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ VPX_BLEND_A256_ROUND_BITS) // Blending by averaging. -#define VPX_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) +#define VPX_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) #endif // VPX_DSP_BLEND_H_ diff --git a/vpx_dsp/blend_a64_hmask.c b/vpx_dsp/blend_a64_hmask.c index 90f3415fffd4c6b81847a505a1e623d882ced1f6..46d73ffd289730d1a5d2b9cfe5d2e0aca5cbe2f8 100644 --- a/vpx_dsp/blend_a64_hmask.c +++ b/vpx_dsp/blend_a64_hmask.c @@ -17,11 +17,10 @@ #include "./vpx_dsp_rtcd.h" -void vpx_blend_a64_hmask_c( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +void vpx_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); @@ -34,19 +33,17 @@ void vpx_blend_a64_hmask_c( for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j], - src0[i * src0_stride + j], - src1[i * src1_stride + j]); + dst[i * dst_stride + j] = VPX_BLEND_A64( + mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_blend_a64_hmask_c( - uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, int bd) { +void vpx_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { int i, j; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); @@ -64,9 +61,8 @@ void vpx_highbd_blend_a64_hmask_c( for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j], - src0[i * src0_stride + j], - src1[i * src1_stride + j]); + dst[i * dst_stride + j] = VPX_BLEND_A64( + mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } diff --git a/vpx_dsp/blend_a64_mask.c b/vpx_dsp/blend_a64_mask.c index 1649798e404340e5cc4cd2dcb0b311904e991fb0..eee544c1db5db6478541ad83d09eb8b66c0d1c94 100644 --- a/vpx_dsp/blend_a64_mask.c +++ b/vpx_dsp/blend_a64_mask.c @@ -24,8 +24,8 @@ void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t 
src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int subh, int subw) { + const uint8_t *mask, uint32_t mask_stride, int h, + int w, int subh, int subw) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); @@ -40,22 +40,20 @@ void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = mask[i * mask_stride + j]; - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } else if (subw == 1 && subh == 1) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int m = - ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] + - mask[(2 * i + 1) * mask_stride + (2 * j)] + - mask[(2 * i) * mask_stride + (2 * j + 1)] + - mask[(2 * i + 1) * mask_stride + (2 * j + 1)], - 2); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } @@ -64,8 +62,7 @@ void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, for (j = 0; j < w; ++j) { const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)], mask[i * mask_stride + (2 * j + 1)]); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } @@ -74,8 +71,7 @@ void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, for (j = 0; j < w; ++j) { const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j], mask[(2 * i + 1) * mask_stride + j]); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } @@ -107,22 +103,20 @@ void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { const int m = mask[i * mask_stride + j]; - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } else if (subw == 1 && subh == 1) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int m = - ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] + - mask[(2 * i + 1) * mask_stride + (2 * j)] + - mask[(2 * i) * mask_stride + (2 * j + 1)] + - mask[(2 * i + 1) * mask_stride + (2 * j + 1)], - 2); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } @@ -131,8 +125,7 @@ void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, for (j = 0; j < w; ++j) { const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)], mask[i * mask_stride + (2 * j + 1)]); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } @@ 
-141,8 +134,7 @@ void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, for (j = 0; j < w; ++j) { const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j], mask[(2 * i + 1) * mask_stride + j]); - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } diff --git a/vpx_dsp/blend_a64_vmask.c b/vpx_dsp/blend_a64_vmask.c index 5d48a8336cbb581b5436611e954558d80749433d..4a2ced75182483d666509aa63024ed13ed89901d 100644 --- a/vpx_dsp/blend_a64_vmask.c +++ b/vpx_dsp/blend_a64_vmask.c @@ -17,11 +17,10 @@ #include "./vpx_dsp_rtcd.h" -void vpx_blend_a64_vmask_c( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +void vpx_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); @@ -35,19 +34,17 @@ void vpx_blend_a64_vmask_c( for (i = 0; i < h; ++i) { const int m = mask[i]; for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_blend_a64_vmask_c( - uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, int bd) { +void vpx_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { int i, j; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); @@ -66,8 +63,7 @@ void vpx_highbd_blend_a64_vmask_c( for (i = 0; i < h; ++i) { const int m = mask[i]; for (j = 0; j < w; ++j) { - dst[i * dst_stride + j] = VPX_BLEND_A64(m, - src0[i * src0_stride + j], + dst[i * dst_stride + j] = VPX_BLEND_A64(m, src0[i * src0_stride + j], src1[i * src1_stride + j]); } } diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c index aba99d7a665d8be4059843f4741ac104c5b9bad5..589b124e26a44f90f370b7b35964217ecd49bfff 100644 --- a/vpx_dsp/deblock.c +++ b/vpx_dsp/deblock.c @@ -10,26 +10,32 @@ #include <stdlib.h> #include "vpx/vpx_integer.h" -const int16_t vpx_rv[] = {8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, - 14, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, 3, 14, - 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, 3, 3, 1, 13, - 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10, - 8, 10, 13, 10, 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5, - 4, 4, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3, - 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, 12, 0, - 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7, - 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, - 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3, - 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10, - 8, 9, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, - 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 
11, 8, 13, 1, 13, - 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, - 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, - 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, - 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, - 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, 11, 9, 14, 8, 14, 13, - 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12, - 13, 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, }; +const int16_t vpx_rv[] = { + 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14, + 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, + 3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, + 2, 9, 7, 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, + 2, 0, 13, 13, 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, 4, 14, 4, 10, 0, + 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, 0, 10, 0, 5, + 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3, 4, 7, + 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, + 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, + 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, + 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, + 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3, 10, 5, 8, 0, 11, 6, + 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10, 8, 9, + 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, + 7, 2, 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, + 0, 11, 8, 13, 1, 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, + 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0, + 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12, + 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, + 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, + 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, + 3, 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, + 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, 0, 9, 5, 5, 11, 10, 13, + 9, 10, 13, +}; void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, unsigned char *dst_ptr, @@ -55,8 +61,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, v = p_src[col]; - if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) - && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) { + if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) && + (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) { unsigned char k1, k2, k3; k1 = (p_above2 + p_above1 + 1) >> 1; k2 = (p_below2 + p_below1 + 1) >> 1; @@ -77,10 +83,10 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, for (col = 0; col < cols; col++) { v = p_src[col]; - if ((abs(v - p_src[col - 2]) < f[col]) - && (abs(v - p_src[col - 1]) < f[col]) - && (abs(v - p_src[col + 1]) < f[col]) - && (abs(v - p_src[col + 2]) < f[col])) { + if ((abs(v - p_src[col - 2]) < f[col]) && + (abs(v - p_src[col - 1]) < f[col]) && + (abs(v - p_src[col + 1]) < f[col]) && + (abs(v - p_src[col + 2]) < f[col])) { unsigned char k1, k2, k3; k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1; k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1; @@ -90,8 +96,7 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, d[col & 3] = v; - if (col >= 2) - p_dst[col - 2] = d[(col - 2) & 3]; + if (col >= 2) p_dst[col - 2] = d[(col - 2) & 3]; } /* handle the last two pixels */ @@ -115,14 +120,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, 
int rows, int sumsq = 0; int sum = 0; - for (i = -8; i < 0; i++) - s[i] = s[0]; + for (i = -8; i < 0; i++) s[i] = s[0]; /* 17 avoids valgrind warning - we buffer values in c in d * and only write them when we've read 8 ahead... */ - for (i = 0; i < 17; i++) - s[i + cols] = s[cols - 1]; + for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1]; for (i = -8; i <= 6; i++) { sumsq += s[i] * s[i]; @@ -162,14 +165,12 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, unsigned char d[16]; const int16_t *rv2 = rv3 + ((c * 17) & 127); - for (i = -8; i < 0; i++) - s[i * pitch] = s[0]; + for (i = -8; i < 0; i++) s[i * pitch] = s[0]; /* 17 avoids valgrind warning - we buffer values in c in d * and only write them when we've read 8 ahead... */ - for (i = 0; i < 17; i++) - s[(i + rows) * pitch] = s[(rows - 1) * pitch]; + for (i = 0; i < 17; i++) s[(i + rows) * pitch] = s[(rows - 1) * pitch]; for (i = -8; i <= 6; i++) { sumsq += s[i * pitch] * s[i * pitch]; @@ -184,10 +185,8 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, if (sumsq * 15 - sum * sum < flimit) { d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4; } - if (r >= 8) - s[-8 * pitch] = d[(r - 8) & 15]; + if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15]; s += pitch; } } } - diff --git a/vpx_dsp/fastssim.c b/vpx_dsp/fastssim.c index 7d90891714373c9fe22d41dfae2243b8bc72705a..4d5eb5a6ff12594a9c84fceb5ffb385bbe4a6409 100644 --- a/vpx_dsp/fastssim.c +++ b/vpx_dsp/fastssim.c @@ -55,12 +55,12 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { int l; lw = (_w + 1) >> 1; lh = (_h + 1) >> 1; - data_size = _nlevels * sizeof(fs_level) - + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); + data_size = + _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); for (l = 0; l < _nlevels; l++) { size_t im_size; size_t level_size; - im_size = lw * (size_t) lh; + im_size = lw * (size_t)lh; level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); level_size += sizeof(*_ctx->level[l].ssim) - 1; level_size /= sizeof(*_ctx->level[l].ssim); @@ -70,8 +70,8 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { lw = (lw + 1) >> 1; lh = (lh + 1) >> 1; } - data = (unsigned char *) malloc(data_size); - _ctx->level = (fs_level *) data; + data = (unsigned char *)malloc(data_size); + _ctx->level = (fs_level *)data; _ctx->nlevels = _nlevels; data += _nlevels * sizeof(*_ctx->level); lw = (_w + 1) >> 1; @@ -81,7 +81,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { size_t level_size; _ctx->level[l].w = lw; _ctx->level[l].h = lh; - im_size = lw * (size_t) lh; + im_size = lw * (size_t)lh; level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); level_size += sizeof(*_ctx->level[l].ssim) - 1; level_size /= sizeof(*_ctx->level[l].ssim); @@ -89,17 +89,15 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { _ctx->level[l].im1 = (uint32_t *)data; _ctx->level[l].im2 = _ctx->level[l].im1 + im_size; data += level_size; - _ctx->level[l].ssim = (double *) data; + _ctx->level[l].ssim = (double *)data; data += im_size * sizeof(*_ctx->level[l].ssim); lw = (lw + 1) >> 1; lh = (lh + 1) >> 1; } - _ctx->col_buf = (unsigned *) data; + _ctx->col_buf = (unsigned *)data; } -static void fs_ctx_clear(fs_ctx *_ctx) { - free(_ctx->level); -} +static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } static void fs_downsample_level(fs_ctx *_ctx, int _l) { const uint32_t *src1; @@ -130,18 +128,18 @@ static void fs_downsample_level(fs_ctx *_ctx, int _l) { int i1; i0 
= 2 * i; i1 = FS_MINI(i0 + 1, w2); - dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] - + src1[j1offs + i0] + src1[j1offs + i1]; - dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] - + src2[j1offs + i0] + src2[j1offs + i1]; + dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + + src1[j1offs + i0] + src1[j1offs + i1]; + dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + + src2[j1offs + i0] + src2[j1offs + i1]; } } } static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, int _s1ystride, const uint8_t *_src2, - int _s2ystride, int _w, int _h, - uint32_t bd, uint32_t shift) { + int _s2ystride, int _w, int _h, uint32_t bd, + uint32_t shift) { uint32_t *dst1; uint32_t *dst2; int w; @@ -163,23 +161,23 @@ static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, i0 = 2 * i; i1 = FS_MINI(i0 + 1, _w); if (bd == 8 && shift == 0) { - dst1[j * w + i] = _src1[j0 * _s1ystride + i0] - + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0] - + _src1[j1 * _s1ystride + i1]; - dst2[j * w + i] = _src2[j0 * _s2ystride + i0] - + _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0] - + _src2[j1 * _s2ystride + i1]; + dst1[j * w + i] = + _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] + + _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1]; + dst2[j * w + i] = + _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] + + _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1]; } else { - uint16_t * src1s = CONVERT_TO_SHORTPTR(_src1); - uint16_t * src2s = CONVERT_TO_SHORTPTR(_src2); - dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) - + (src1s[j0 * _s1ystride + i1] >> shift) - + (src1s[j1 * _s1ystride + i0] >> shift) - + (src1s[j1 * _s1ystride + i1] >> shift); - dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) - + (src2s[j0 * _s2ystride + i1] >> shift) - + (src2s[j1 * _s2ystride + i0] >> shift) - + (src2s[j1 * _s2ystride + i1] >> shift); + uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1); + uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2); + dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) + + (src1s[j0 * _s1ystride + i1] >> shift) + + (src1s[j1 * _s1ystride + i0] >> shift) + + (src1s[j1 * _s1ystride + i1] >> shift); + dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) + + (src2s[j0 * _s2ystride + i1] >> shift) + + (src2s[j1 * _s2ystride + i0] >> shift) + + (src2s[j1 * _s2ystride + i1] >> shift); } } } @@ -200,10 +198,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { int j; double ssim_c1 = SSIM_C1; #if CONFIG_VP9_HIGHBITDEPTH - if (bit_depth == 10) - ssim_c1 = SSIM_C1_10; - if (bit_depth == 12) - ssim_c1 = SSIM_C1_12; + if (bit_depth == 10) ssim_c1 = SSIM_C1_10; + if (bit_depth == 12) ssim_c1 = SSIM_C1_12; #else assert(bit_depth == 8); #endif @@ -213,19 +209,15 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { col_sums_y = col_sums_x + w; im1 = _ctx->level[_l].im1; im2 = _ctx->level[_l].im2; - for (i = 0; i < w; i++) - col_sums_x[i] = 5 * im1[i]; - for (i = 0; i < w; i++) - col_sums_y[i] = 5 * im2[i]; + for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i]; + for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i]; for (j = 1; j < 4; j++) { j1offs = FS_MINI(j, h - 1) * w; - for (i = 0; i < w; i++) - col_sums_x[i] += im1[j1offs + i]; - for (i = 0; i < w; i++) - col_sums_y[i] += im2[j1offs + i]; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; } ssim = _ctx->level[_l].ssim; - c1 = (double) (ssim_c1 * 4096 * 
(1 << 4 * _l)); + c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); for (j = 0; j < h; j++) { unsigned mux; unsigned muy; @@ -239,8 +231,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { muy += col_sums_y[i1]; } for (i = 0; i < w; i++) { - ssim[j * w + i] *= (2 * mux * (double) muy + c1) - / (mux * (double) mux + muy * (double) muy + c1); + ssim[j * w + i] *= (2 * mux * (double)muy + c1) / + (mux * (double)mux + muy * (double)muy + c1); if (i + 1 < w) { i0 = FS_MAXI(0, i - 4); i1 = FS_MINI(i + 4, w - 1); @@ -250,78 +242,68 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { } if (j + 1 < h) { j0offs = FS_MAXI(0, j - 4) * w; - for (i = 0; i < w; i++) - col_sums_x[i] -= im1[j0offs + i]; - for (i = 0; i < w; i++) - col_sums_y[i] -= im2[j0offs + i]; + for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; j1offs = FS_MINI(j + 4, h - 1) * w; - for (i = 0; i < w; i++) - col_sums_x[i] += im1[j1offs + i]; - for (i = 0; i < w; i++) - col_sums_y[i] += im2[j1offs + i]; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; } } } -#define FS_COL_SET(_col, _joffs, _ioffs) \ - do { \ - unsigned gx; \ - unsigned gy; \ +#define FS_COL_SET(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - col_sums_gx2[(_col)] = gx * (double)gx; \ - col_sums_gy2[(_col)] = gy * (double)gy; \ - col_sums_gxgy[(_col)] = gx * (double)gy; \ - } \ - while (0) + col_sums_gx2[(_col)] = gx * (double)gx; \ + col_sums_gy2[(_col)] = gy * (double)gy; \ + col_sums_gxgy[(_col)] = gx * (double)gy; \ + } while (0) -#define FS_COL_ADD(_col, _joffs, _ioffs) \ - do { \ - unsigned gx; \ - unsigned gy; \ +#define FS_COL_ADD(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - col_sums_gx2[(_col)] += gx * (double)gx; \ - col_sums_gy2[(_col)] += gy * (double)gy; \ - col_sums_gxgy[(_col)] += gx * (double)gy; \ - } \ - while (0) + col_sums_gx2[(_col)] += gx * (double)gx; \ + col_sums_gy2[(_col)] += gy * (double)gy; \ + col_sums_gxgy[(_col)] += gx * (double)gy; \ + } while (0) -#define FS_COL_SUB(_col, _joffs, _ioffs) \ - do { \ - unsigned gx; \ - unsigned gy; \ +#define FS_COL_SUB(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ - col_sums_gx2[(_col)] -= gx * (double)gx; \ - col_sums_gy2[(_col)] -= gy * (double)gy; \ - col_sums_gxgy[(_col)] -= gx * (double)gy; \ - } \ - while (0) + col_sums_gx2[(_col)] -= gx * (double)gx; \ + col_sums_gy2[(_col)] -= gy * (double)gy; \ + col_sums_gxgy[(_col)] -= gx * (double)gy; \ + } while (0) -#define FS_COL_COPY(_col1, _col2) \ - do { \ - col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ - col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ +#define FS_COL_COPY(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \ - } \ - while (0) + } while (0) -#define FS_COL_HALVE(_col1, _col2) \ - do { \ - col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ - col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ +#define 
FS_COL_HALVE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \ - } \ - while (0) + } while (0) -#define FS_COL_DOUBLE(_col1, _col2) \ - do { \ - col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ - col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ +#define FS_COL_DOUBLE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \ - } \ - while (0) + } while (0) static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { uint32_t *im1; @@ -340,10 +322,8 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { int j; double ssim_c2 = SSIM_C2; #if CONFIG_VP9_HIGHBITDEPTH - if (bit_depth == 10) - ssim_c2 = SSIM_C2_10; - if (bit_depth == 12) - ssim_c2 = SSIM_C2_12; + if (bit_depth == 10) ssim_c2 = SSIM_C2_10; + if (bit_depth == 12) ssim_c2 = SSIM_C2_12; #else assert(bit_depth == 8); #endif @@ -398,14 +378,11 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { double mugy2; double mugxgy; mugx2 = col_sums_gx2[0]; - for (k = 1; k < 8; k++) - mugx2 += col_sums_gx2[k]; + for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k]; mugy2 = col_sums_gy2[0]; - for (k = 1; k < 8; k++) - mugy2 += col_sums_gy2[k]; + for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k]; mugxgy = col_sums_gxgy[0]; - for (k = 1; k < 8; k++) - mugxgy += col_sums_gxgy[k]; + for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k]; ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2); if (i + 1 < w) { FS_COL_SET(0, -1, 1); @@ -440,8 +417,9 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}. 
We drop the finest scale and renormalize the rest to sum to 1.*/ -static const double FS_WEIGHTS[FS_NLEVELS] = {0.2989654541015625, - 0.3141326904296875, 0.2473602294921875, 0.1395416259765625}; +static const double FS_WEIGHTS[FS_NLEVELS] = { + 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625 +}; static double fs_average(fs_ctx *_ctx, int _l) { double *ssim; @@ -455,28 +433,26 @@ static double fs_average(fs_ctx *_ctx, int _l) { ssim = _ctx->level[_l].ssim; ret = 0; for (j = 0; j < h; j++) - for (i = 0; i < w; i++) - ret += ssim[j * w + i]; + for (i = 0; i < w; i++) ret += ssim[j * w + i]; return pow(ret / (w * h), FS_WEIGHTS[_l]); } static double convert_ssim_db(double _ssim, double _weight) { assert(_weight >= _ssim); - if ((_weight - _ssim) < 1e-10) - return MAX_SSIM_DB; + if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB; return 10 * (log10(_weight) - log10(_weight - _ssim)); } -static double calc_ssim(const uint8_t *_src, int _systride, - const uint8_t *_dst, int _dystride, - int _w, int _h, uint32_t _bd, uint32_t _shift) { +static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, + int _dystride, int _w, int _h, uint32_t _bd, + uint32_t _shift) { fs_ctx ctx; double ret; int l; ret = 1; fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); - fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, - _w, _h, _bd, _shift); + fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd, + _shift); for (l = 0; l < FS_NLEVELS - 1; l++) { fs_calc_structure(&ctx, l, _bd); ret *= fs_average(&ctx, l); @@ -490,9 +466,9 @@ static double calc_ssim(const uint8_t *_src, int _systride, } double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *ssim_y, double *ssim_u, double *ssim_v, - uint32_t bd, uint32_t in_bd) { + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd) { double ssimv; uint32_t bd_shift = 0; vpx_clear_system_state(); diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c index 4c0d5db83760beb1b69789caf228e02fcd3e8ec7..4e7d4053ea9ceb20d2e89eff8d88843b9d3f74ee 100644 --- a/vpx_dsp/fwd_txfm.c +++ b/vpx_dsp/fwd_txfm.c @@ -72,8 +72,7 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { { int i, j; for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - output[j + i * 4] = (output[j + i * 4] + 1) >> 2; + for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2; } } } @@ -82,8 +81,7 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; tran_low_t sum = 0; for (r = 0; r < 4; ++r) - for (c = 0; c < 4; ++c) - sum += input[r * stride + c]; + for (c = 0; c < 4; ++c) sum += input[r * stride + c]; output[0] = sum << 1; } @@ -133,8 +131,8 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { x3 = s0 - s3; t0 = (x0 + x1) * cospi_16_64; t1 = (x0 - x1) * cospi_16_64; - t2 = x2 * cospi_24_64 + x3 * cospi_8_64; - t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; output[0] = (tran_low_t)fdct_round_shift(t0); output[2] = (tran_low_t)fdct_round_shift(t2); output[4] = (tran_low_t)fdct_round_shift(t1); @@ -153,24 +151,23 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { x3 = s7 + t3; // Stage 4 - t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * 
cospi_12_64 + x2 * cospi_20_64; t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; output[1] = (tran_low_t)fdct_round_shift(t0); output[3] = (tran_low_t)fdct_round_shift(t2); output[5] = (tran_low_t)fdct_round_shift(t1); output[7] = (tran_low_t)fdct_round_shift(t3); output += 8; } - in = intermediate; + in = intermediate; output = final_output; } // Rows for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - final_output[j + i * 8] /= 2; + for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; } } @@ -178,8 +175,7 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; tran_low_t sum = 0; for (r = 0; r < 8; ++r) - for (c = 0; c < 8; ++c) - sum += input[r * stride + c]; + for (c = 0; c < 8; ++c) sum += input[r * stride + c]; output[0] = sum; } @@ -214,11 +210,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; - input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4; - input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4; + input[6] = (in_pass0[6 * stride] + in_pass0[9 * stride]) * 4; + input[7] = (in_pass0[7 * stride] + in_pass0[8 * stride]) * 4; // Calculate input for the next 8 results. - step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4; - step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4; + step1[0] = (in_pass0[7 * stride] - in_pass0[8 * stride]) * 4; + step1[1] = (in_pass0[6 * stride] - in_pass0[9 * stride]) * 4; step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; @@ -233,11 +229,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); - input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); - input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); + input[6] = ((in[6 * 16] + 1) >> 2) + ((in[9 * 16] + 1) >> 2); + input[7] = ((in[7 * 16] + 1) >> 2) + ((in[8 * 16] + 1) >> 2); // Calculate input for the next 8 results. 
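// Illustrative note, not part of the patch: the cospi_N_64 constants used in
// these butterflies are (to my reading of txfm_common.h) cos(N*pi/64) in Q14
// fixed point, and fdct_round_shift() is a rounded right shift by
// DCT_CONST_BITS (14), so each t0..t3 pair is a fixed-point rotation. A
// minimal standalone restatement of that rounding (ex_ name is hypothetical):
static long long ex_fdct_round_shift(long long x) {
  return (x + (1LL << 13)) >> 14; /* round(x / 2^14) to nearest */
}
// e.g. with cospi_16_64 == 11585 (~ 2^14 * cos(pi/4)),
// ex_fdct_round_shift(100 * 11585) == 71, i.e. 100 / sqrt(2) rounded.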
- step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); - step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); + step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[8 * 16] + 1) >> 2); + step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[9 * 16] + 1) >> 2); step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); @@ -268,7 +264,7 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { x3 = s0 - s3; t0 = (x0 + x1) * cospi_16_64; t1 = (x0 - x1) * cospi_16_64; - t2 = x3 * cospi_8_64 + x2 * cospi_24_64; + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; t3 = x3 * cospi_24_64 - x2 * cospi_8_64; out[0] = (tran_low_t)fdct_round_shift(t0); out[4] = (tran_low_t)fdct_round_shift(t2); @@ -288,10 +284,10 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { x3 = s7 + t3; // Stage 4 - t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; out[2] = (tran_low_t)fdct_round_shift(t0); out[6] = (tran_low_t)fdct_round_shift(t2); out[10] = (tran_low_t)fdct_round_shift(t1); @@ -318,12 +314,12 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { step3[6] = step1[6] + step2[5]; step3[7] = step1[7] + step2[4]; // step 4 - temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; - temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; + temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; step2[1] = fdct_round_shift(temp1); step2[2] = fdct_round_shift(temp2); temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; - temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; step2[5] = fdct_round_shift(temp1); step2[6] = fdct_round_shift(temp2); // step 5 @@ -336,20 +332,20 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { step1[6] = step3[7] - step2[6]; step1[7] = step3[7] + step2[6]; // step 6 - temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; out[1] = (tran_low_t)fdct_round_shift(temp1); out[9] = (tran_low_t)fdct_round_shift(temp2); temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; - temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; out[5] = (tran_low_t)fdct_round_shift(temp1); out[13] = (tran_low_t)fdct_round_shift(temp2); - temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; out[3] = (tran_low_t)fdct_round_shift(temp1); out[11] = (tran_low_t)fdct_round_shift(temp2); temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; - temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; out[7] = (tran_low_t)fdct_round_shift(temp1); out[15] = (tran_low_t)fdct_round_shift(temp2); } @@ -368,8 +364,7 @@ void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; int sum = 0; for (r = 0; r < 16; ++r) - for (c = 0; c < 16; ++c) - sum += 
input[r * stride + c]; + for (c = 0; c < 16; ++c) sum += input[r * stride + c]; output[0] = (tran_low_t)(sum >> 1); } @@ -675,36 +670,36 @@ void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) { step[31] = output[31] + output[30]; // Final stage --- outputs indices are bit-reversed. - output[0] = step[0]; + output[0] = step[0]; output[16] = step[1]; - output[8] = step[2]; + output[8] = step[2]; output[24] = step[3]; - output[4] = step[4]; + output[4] = step[4]; output[20] = step[5]; output[12] = step[6]; output[28] = step[7]; - output[2] = step[8]; + output[2] = step[8]; output[18] = step[9]; output[10] = step[10]; output[26] = step[11]; - output[6] = step[12]; + output[6] = step[12]; output[22] = step[13]; output[14] = step[14]; output[30] = step[15]; - output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); + output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); - output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); + output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); - output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); + output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); - output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); + output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); - output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); + output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); @@ -717,8 +712,7 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { // Columns for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = input[j * stride + i] * 4; + for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; @@ -727,8 +721,7 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) out[j + i * 32] = @@ -746,8 +739,7 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { // Columns for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = input[j * stride + i] * 4; + for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) // 
TODO(cd): see quality impact of only doing @@ -759,11 +751,9 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; vpx_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) - out[j + i * 32] = (tran_low_t)temp_out[j]; + for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; } } @@ -771,8 +761,7 @@ void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { int r, c; int sum = 0; for (r = 0; r < 32; ++r) - for (c = 0; c < 32; ++c) - sum += input[r * stride + c]; + for (c = 0; c < 32; ++c) sum += input[r * stride + c]; output[0] = (tran_low_t)(sum >> 3); } diff --git a/vpx_dsp/intrapred.c b/vpx_dsp/intrapred.c index b1076f8f01aaf771e970791927c9d713311509a2..4179e0f78efba6514949710bf2361782cc4e7729 100644 --- a/vpx_dsp/intrapred.c +++ b/vpx_dsp/intrapred.c @@ -14,17 +14,16 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" -#define DST(x, y) dst[(x) + (y) * stride] +#define DST(x, y) dst[(x) + (y)*stride] #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) #define AVG2(a, b) (((a) + (b) + 1) >> 1) static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - (void) above; + (void)above; // first column - for (r = 0; r < bs - 1; ++r) - dst[r * stride] = AVG2(left[r], left[r + 1]); + for (r = 0; r < bs - 1; ++r) dst[r * stride] = AVG2(left[r], left[r + 1]); dst[(bs - 1) * stride] = left[bs - 1]; dst++; @@ -36,8 +35,7 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, dst++; // rest of last row - for (c = 0; c < bs - 2; ++c) - dst[(bs - 1) * stride + c] = left[bs - 1]; + for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1]; for (r = bs - 2; r >= 0; --r) for (c = 0; c < bs - 2; ++c) @@ -47,13 +45,13 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - (void) above; + (void)above; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); + : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); } dst += stride; } @@ -79,12 +77,12 @@ static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - (void) left; + (void)left; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = r & 1 ? 
AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); + : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); } dst += stride; } @@ -112,7 +110,7 @@ static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; - (void) left; + (void)left; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = AVG3(above[r + c], above[r + c + 1], @@ -127,14 +125,12 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, int r, c; // first row - for (c = 0; c < bs; c++) - dst[c] = AVG2(above[c - 1], above[c]); + for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); dst += stride; // second row dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); dst += stride; // the rest of first col @@ -144,8 +140,7 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, // the rest of the block for (r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-2 * stride + c - 1]; + for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; dst += stride; } } @@ -182,8 +177,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bs; r++) - dst[r * stride] = AVG2(left[r - 1], left[r]); + for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); dst++; dst[0] = AVG3(left[0], above[-1], above[0]); @@ -197,8 +191,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, dst += stride; for (r = 1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) - dst[c] = dst[-stride + c - 2]; + for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; dst += stride; } } @@ -206,7 +199,7 @@ static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; - (void) left; + (void)left; for (r = 0; r < bs; r++) { memcpy(dst, above, bs); @@ -217,7 +210,7 @@ static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; - (void) above; + (void)above; for (r = 0; r < bs; r++) { memset(dst, left[r], bs); @@ -240,8 +233,8 @@ static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r; - (void) above; - (void) left; + (void)above; + (void)left; for (r = 0; r < bs; r++) { memset(dst, 128, bs); @@ -253,10 +246,9 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; - (void) above; + (void)above; - for (i = 0; i < bs; i++) - sum += left[i]; + for (i = 0; i < bs; i++) sum += left[i]; expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { @@ -268,10 +260,9 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int i, r, 
expected_dc, sum = 0; - (void) left; + (void)left; - for (i = 0; i < bs; i++) - sum += above[i]; + for (i = 0; i < bs; i++) sum += above[i]; expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { @@ -338,14 +329,13 @@ void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int K = left[2]; const int L = left[3]; (void)above; - DST(0, 0) = AVG2(I, J); + DST(0, 0) = AVG2(I, J); DST(2, 0) = DST(0, 1) = AVG2(J, K); DST(2, 1) = DST(0, 2) = AVG2(K, L); - DST(1, 0) = AVG3(I, J, K); + DST(1, 0) = AVG3(I, J, K); DST(3, 0) = DST(1, 1) = AVG3(J, K, L); DST(3, 1) = DST(1, 2) = AVG3(K, L, L); - DST(3, 2) = DST(2, 2) = - DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; + DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; } void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -358,17 +348,17 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int F = above[5]; const int G = above[6]; (void)left; - DST(0, 0) = AVG2(A, B); + DST(0, 0) = AVG2(A, B); DST(1, 0) = DST(0, 2) = AVG2(B, C); DST(2, 0) = DST(1, 2) = AVG2(C, D); DST(3, 0) = DST(2, 2) = AVG2(D, E); - DST(3, 2) = AVG2(E, F); // differs from vp8 + DST(3, 2) = AVG2(E, F); // differs from vp8 - DST(0, 1) = AVG3(A, B, C); + DST(0, 1) = AVG3(A, B, C); DST(1, 1) = DST(0, 3) = AVG3(B, C, D); DST(2, 1) = DST(1, 3) = AVG3(C, D, E); DST(3, 1) = DST(2, 3) = AVG3(D, E, F); - DST(3, 3) = AVG3(E, F, G); // differs from vp8 + DST(3, 3) = AVG3(E, F, G); // differs from vp8 } void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -382,17 +372,17 @@ void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int G = above[6]; const int H = above[7]; (void)left; - DST(0, 0) = AVG2(A, B); + DST(0, 0) = AVG2(A, B); DST(1, 0) = DST(0, 2) = AVG2(B, C); DST(2, 0) = DST(1, 2) = AVG2(C, D); DST(3, 0) = DST(2, 2) = AVG2(D, E); - DST(3, 2) = AVG3(E, F, G); + DST(3, 2) = AVG3(E, F, G); - DST(0, 1) = AVG3(A, B, C); + DST(0, 1) = AVG3(A, B, C); DST(1, 1) = DST(0, 3) = AVG3(B, C, D); DST(2, 1) = DST(1, 3) = AVG3(C, D, E); DST(3, 1) = DST(2, 3) = AVG3(D, E, F); - DST(3, 3) = AVG3(F, G, H); + DST(3, 3) = AVG3(F, G, H); } void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -407,13 +397,13 @@ void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int H = above[7]; (void)stride; (void)left; - DST(0, 0) = AVG3(A, B, C); - DST(1, 0) = DST(0, 1) = AVG3(B, C, D); - DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); - DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); - DST(3, 2) = DST(2, 3) = AVG3(F, G, H); - DST(3, 3) = H; // differs from vp8 + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = H; // differs from vp8 } void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -428,13 +418,13 @@ void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int H = above[7]; (void)stride; (void)left; - DST(0, 0) = AVG3(A, B, C); - DST(1, 0) = DST(0, 1) = AVG3(B, C, D); - DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); - DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); - DST(3, 2) = DST(2, 3) = AVG3(F, G, H); - DST(3, 3) = AVG3(G, H, H); + 
DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = AVG3(G, H, H); } void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -450,14 +440,14 @@ void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, DST(0, 0) = DST(1, 2) = AVG2(X, A); DST(1, 0) = DST(2, 2) = AVG2(A, B); DST(2, 0) = DST(3, 2) = AVG2(B, C); - DST(3, 0) = AVG2(C, D); + DST(3, 0) = AVG2(C, D); - DST(0, 3) = AVG3(K, J, I); - DST(0, 2) = AVG3(J, I, X); + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); DST(0, 1) = DST(1, 3) = AVG3(I, X, A); DST(1, 1) = DST(2, 3) = AVG3(X, A, B); DST(2, 1) = DST(3, 3) = AVG3(A, B, C); - DST(3, 1) = AVG3(B, C, D); + DST(3, 1) = AVG3(B, C, D); } void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -472,13 +462,13 @@ void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const int C = above[2]; const int D = above[3]; (void)stride; - DST(0, 3) = AVG3(J, K, L); - DST(1, 3) = DST(0, 2) = AVG3(I, J, K); - DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); + DST(0, 3) = AVG3(J, K, L); + DST(1, 3) = DST(0, 2) = AVG3(I, J, K); + DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); - DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); - DST(3, 1) = DST(2, 0) = AVG3(C, B, A); - DST(3, 0) = AVG3(D, C, B); + DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); + DST(3, 1) = DST(2, 0) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); } void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, @@ -495,14 +485,14 @@ void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, DST(0, 0) = DST(2, 1) = AVG2(I, X); DST(0, 1) = DST(2, 2) = AVG2(J, I); DST(0, 2) = DST(2, 3) = AVG2(K, J); - DST(0, 3) = AVG2(L, K); + DST(0, 3) = AVG2(L, K); - DST(3, 0) = AVG3(A, B, C); - DST(2, 0) = AVG3(X, A, B); + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); DST(1, 0) = DST(3, 1) = AVG3(I, X, A); DST(1, 1) = DST(3, 2) = AVG3(J, I, X); DST(1, 2) = DST(3, 3) = AVG3(K, J, I); - DST(1, 3) = AVG3(L, K, J); + DST(1, 3) = AVG3(L, K, J); } #if CONFIG_VP9_HIGHBITDEPTH @@ -510,8 +500,8 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) above; - (void) bd; + (void)above; + (void)bd; // First column. for (r = 0; r < bs - 1; ++r) { @@ -529,8 +519,7 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, dst++; // Rest of last row. - for (c = 0; c < bs - 2; ++c) - dst[(bs - 1) * stride + c] = left[bs - 1]; + for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1]; for (r = bs - 2; r >= 0; --r) { for (c = 0; c < bs - 2; ++c) @@ -542,30 +531,30 @@ static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) above; - (void) bd; + (void)above; + (void)bd; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = c & 1 ? 
AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); + : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); } dst += stride; } } -static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, +static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) left; - (void) bd; + (void)left; + (void)bd; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); + : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); } dst += stride; } @@ -577,13 +566,13 @@ static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) left; - (void) bd; + (void)left; + (void)bd; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { - dst[c] = r + c + 2 < bs * 2 ? AVG3(above[r + c], above[r + c + 1], - above[r + c + 2]) - : above[bs * 2 - 1]; + dst[c] = r + c + 2 < bs * 2 + ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2]) + : above[bs * 2 - 1]; } dst += stride; } @@ -593,8 +582,8 @@ static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) left; - (void) bd; + (void)left; + (void)bd; for (r = 0; r < bs; ++r) { for (c = 0; c < bs; ++c) { dst[c] = AVG3(above[r + c], above[r + c + 1], @@ -608,17 +597,15 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) bd; + (void)bd; // first row - for (c = 0; c < bs; c++) - dst[c] = AVG2(above[c - 1], above[c]); + for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); dst += stride; // second row dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); dst += stride; // the rest of first col @@ -628,8 +615,7 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, // the rest of the block for (r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-2 * stride + c - 1]; + for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; dst += stride; } } @@ -638,10 +624,9 @@ static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) bd; + (void)bd; dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); dst[stride] = AVG3(above[-1], left[0], left[1]); for (r = 2; r < bs; ++r) @@ -649,8 +634,7 @@ static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, dst += stride; for (r = 1; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-stride + c - 1]; + for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1]; dst += stride; } } @@ -659,10 +643,9 @@ static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; - (void) bd; + (void)bd; dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bs; r++) - dst[r * 
stride] = AVG2(left[r - 1], left[r]); + for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); dst++; dst[0] = AVG3(left[0], above[-1], above[0]); @@ -676,42 +659,41 @@ static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, dst += stride; for (r = 1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) - dst[c] = dst[-stride + c - 2]; + for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; dst += stride; } } -static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, +static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, int bd) { int r; - (void) left; - (void) bd; + (void)left; + (void)bd; for (r = 0; r < bs; r++) { memcpy(dst, above, bs * sizeof(uint16_t)); dst += stride; } } -static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, +static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, int bd) { int r; - (void) above; - (void) bd; + (void)above; + (void)bd; for (r = 0; r < bs; r++) { vpx_memset16(dst, left[r], bs); dst += stride; } } -static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, +static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, int bd) { int r, c; int ytop_left = above[-1]; - (void) bd; + (void)bd; for (r = 0; r < bs; r++) { for (c = 0; c < bs; c++) @@ -724,8 +706,8 @@ static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r; - (void) above; - (void) left; + (void)above; + (void)left; for (r = 0; r < bs; r++) { vpx_memset16(dst, 128 << (bd - 8), bs); @@ -737,11 +719,10 @@ static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; - (void) above; - (void) bd; + (void)above; + (void)bd; - for (i = 0; i < bs; i++) - sum += left[i]; + for (i = 0; i < bs; i++) sum += left[i]; expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { @@ -754,11 +735,10 @@ static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; - (void) left; - (void) bd; + (void)left; + (void)bd; - for (i = 0; i < bs; i++) - sum += above[i]; + for (i = 0; i < bs; i++) sum += above[i]; expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { @@ -767,12 +747,12 @@ static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, } } -static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; const int count = 2 * bs; - (void) bd; + (void)bd; for (i = 0; i < bs; i++) { sum += above[i]; @@ -791,22 +771,22 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, // This serves as a wrapper function, so that all the prediction functions // can be unified and accessed as a pointer array. Note that the boundary // above and left are not necessarily used all the time. 
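/*
 * Editor's illustration (not part of the patch): the intra_pred_sized()
 * macro below generates one thin per-size wrapper per predictor type by
 * token pasting.  Assuming the size-generic dc_predictor() defined earlier
 * in this file, intra_pred_sized(dc, 4) expands to roughly:
 *
 *   void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
 *                               const uint8_t *above, const uint8_t *left) {
 *     dc_predictor(dst, stride, 4, above, left);
 *   }
 *
 * intra_pred_allsizes() then instantiates the 4, 8, 16 and 32 variants, so
 * every vpx_*_predictor_NxN_c symbol can be exposed through a pointer array.
 */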
-#define intra_pred_sized(type, size) \ - void vpx_##type##_predictor_##size##x##size##_c(uint8_t *dst, \ - ptrdiff_t stride, \ - const uint8_t *above, \ - const uint8_t *left) { \ - type##_predictor(dst, stride, size, above, left); \ +#define intra_pred_sized(type, size) \ + void vpx_##type##_predictor_##size##x##size##_c( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ + const uint8_t *left) { \ + type##_predictor(dst, stride, size, above, left); \ } #if CONFIG_VP9_HIGHBITDEPTH -#define intra_pred_highbd_sized(type, size) \ - void vpx_highbd_##type##_predictor_##size##x##size##_c( \ - uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ - const uint16_t *left, int bd) { \ +#define intra_pred_highbd_sized(type, size) \ + void vpx_highbd_##type##_predictor_##size##x##size##_c( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ highbd_##type##_predictor(dst, stride, size, above, left, bd); \ } +/* clang-format off */ #define intra_pred_allsizes(type) \ intra_pred_sized(type, 4) \ intra_pred_sized(type, 8) \ @@ -855,4 +835,5 @@ intra_pred_allsizes(dc_128) intra_pred_allsizes(dc_left) intra_pred_allsizes(dc_top) intra_pred_allsizes(dc) +/* clang-format on */ #undef intra_pred_allsizes diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c index 707cb92bbb2dcea4ff3a4f4e2d70207907a013f8..d5be32e7db927c5800bda2f77f69a2d96ec1e2cb 100644 --- a/vpx_dsp/inv_txfm.c +++ b/vpx_dsp/inv_txfm.c @@ -15,8 +15,8 @@ #include "vpx_dsp/inv_txfm.h" void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { -/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, - 0.5 shifts per pixel. */ + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. 
*/ int i; tran_low_t output[16]; tran_high_t a1, b1, c1, d1, e1; @@ -127,8 +127,7 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { // Columns for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; idct4_c(temp_in, temp_out); for (j = 0; j < 4; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -223,8 +222,7 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { // Then transform columns for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; idct8_c(temp_in, temp_out); for (j = 0; j < 8; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -240,8 +238,7 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) - dest[i] = clip_pixel_add(dest[i], a1); + for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); dest += stride; } } @@ -296,20 +293,20 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { tran_high_t x7 = input[6]; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = 0; + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = 0; return; } // stage 1 - s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); - s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); + s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); + s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); - s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); - s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); + s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); + s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); @@ -376,8 +373,7 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { // Then transform columns for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; idct8_c(temp_in, temp_out); for (j = 0; j < 8; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -391,22 +387,22 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) { tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0/2]; - step1[1] = input[16/2]; - step1[2] = input[8/2]; - step1[3] = input[24/2]; - step1[4] = input[4/2]; - step1[5] = input[20/2]; - step1[6] = input[12/2]; - step1[7] = input[28/2]; - step1[8] = input[2/2]; - step1[9] = input[18/2]; - step1[10] = input[10/2]; - step1[11] = input[26/2]; - step1[12] = input[6/2]; - step1[13] = input[22/2]; - step1[14] = input[14/2]; - step1[15] = input[30/2]; + step1[0] = input[0 / 2]; + step1[1] = input[16 / 2]; + step1[2] = input[8 / 2]; + step1[3] = input[24 / 2]; + step1[4] = input[4 / 2]; + step1[5] = input[20 / 2]; + step1[6] = input[12 / 2]; + step1[7] = input[28 / 2]; + step1[8] = input[2 / 2]; + step1[9] = input[18 / 2]; + step1[10] = input[10 / 2]; + step1[11] = input[26 / 2]; + step1[12] = input[6 / 2]; + step1[13] = input[22 / 2]; + 
step1[14] = input[14 / 2]; + step1[15] = input[30 / 2]; // stage 2 step2[0] = step1[0]; @@ -567,8 +563,7 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, // Then transform columns for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; idct16_c(temp_in, temp_out); for (j = 0; j < 16; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -598,21 +593,20 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { tran_high_t x14 = input[1]; tran_high_t x15 = input[14]; - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 - | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = output[8] - = output[9] = output[10] = output[11] = output[12] - = output[13] = output[14] = output[15] = 0; + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = output[8] = output[9] = output[10] = + output[11] = output[12] = output[13] = output[14] = output[15] = 0; return; } // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; s5 = x4 * cospi_23_64 - x5 * cospi_9_64; s6 = x6 * cospi_13_64 + x7 * cospi_19_64; s7 = x6 * cospi_19_64 - x7 * cospi_13_64; @@ -621,9 +615,9 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { s10 = x10 * cospi_21_64 + x11 * cospi_11_64; s11 = x10 * cospi_11_64 - x11 * cospi_21_64; s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); @@ -651,14 +645,14 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { s5 = x5; s6 = x6; s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; x0 = WRAPLOW(s0 + s4); x1 = WRAPLOW(s1 + s5); @@ -682,18 +676,18 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { s1 = x1; s2 = x2; s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; s8 = x8; 
s9 = x9; s10 = x10; s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; x0 = WRAPLOW(s0 + s2); x1 = WRAPLOW(s1 + s3); @@ -713,13 +707,13 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); // stage 4 - s2 = (- cospi_16_64) * (x2 + x3); + s2 = (-cospi_16_64) * (x2 + x3); s3 = cospi_16_64 * (x2 - x3); s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (- x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (- x10 + x11); - s14 = (- cospi_16_64) * (x14 + x15); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); x2 = WRAPLOW(dct_const_round_shift(s2)); @@ -766,8 +760,7 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, // Then transform columns for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; idct16_c(temp_in, temp_out); for (j = 0; j < 16; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -783,8 +776,7 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) - dest[i] = clip_pixel_add(dest[i], a1); + for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1); dest += stride; } } @@ -1166,8 +1158,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, // Rows for (i = 0; i < 32; ++i) { int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) - zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; for (j = 0; j < 8; ++j) zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; for (j = 0; j < 4; ++j) @@ -1185,8 +1176,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, // Columns for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; idct32_c(temp_in, temp_out); for (j = 0; j < 32; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -1197,7 +1187,7 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[32 * 32] = {0}; + tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; int i, j; tran_low_t temp_in[32], temp_out[32]; @@ -1212,8 +1202,7 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, // Columns for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; idct32_c(temp_in, temp_out); for (j = 0; j < 32; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -1224,7 +1213,7 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[32 * 32] = {0}; + tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; int i, j; tran_low_t temp_in[32], temp_out[32]; @@ -1239,8 +1228,7 @@ void 
vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, // Columns for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; idct32_c(temp_in, temp_out); for (j = 0; j < 32; ++j) { dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], @@ -1258,8 +1246,7 @@ void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) - dest[i] = clip_pixel_add(dest[i], a1); + for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1); dest += stride; } } @@ -1309,14 +1296,14 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, c1 = e1 - c1; a1 -= b1; d1 += c1; - dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], - HIGHBD_WRAPLOW(a1, bd), bd); - dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], - HIGHBD_WRAPLOW(b1, bd), bd); - dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], - HIGHBD_WRAPLOW(c1, bd), bd); - dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], - HIGHBD_WRAPLOW(d1, bd), bd); + dest[stride * 0] = + highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd); + dest[stride * 1] = + highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd); + dest[stride * 2] = + highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd); + dest[stride * 3] = + highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd); ip++; dest++; @@ -1331,7 +1318,7 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, const tran_low_t *ip = in; tran_low_t *op = tmp; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - (void) bd; + (void)bd; a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; @@ -1343,14 +1330,14 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = highbd_clip_pixel_add( - dest[dest_stride * 0], a1, bd); - dest[dest_stride * 1] = highbd_clip_pixel_add( - dest[dest_stride * 1], e1, bd); - dest[dest_stride * 2] = highbd_clip_pixel_add( - dest[dest_stride * 2], e1, bd); - dest[dest_stride * 3] = highbd_clip_pixel_add( - dest[dest_stride * 3], e1, bd); + dest[dest_stride * 0] = + highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = + highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = + highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = + highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); ip++; dest++; } @@ -1359,7 +1346,7 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step[4]; tran_high_t temp1, temp2; - (void) bd; + (void)bd; // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; @@ -1394,8 +1381,7 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, // Columns for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; vpx_highbd_idct4_c(temp_in, temp_out, bd); for (j = 0; j < 4; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -1408,8 +1394,8 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, int dest_stride, int bd) { int i; tran_high_t a1; - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * 
cospi_16_64), bd); + tran_low_t out = + HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); @@ -1486,8 +1472,7 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, // Then transform columns. for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); for (j = 0; j < 8; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -1500,14 +1485,13 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = + HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); dest += stride; } } @@ -1519,7 +1503,7 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t x1 = input[1]; tran_low_t x2 = input[2]; tran_low_t x3 = input[3]; - (void) bd; + (void)bd; if (!(x0 | x1 | x2 | x3)) { memset(output, 0, 4 * sizeof(*output)); @@ -1561,7 +1545,7 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t x5 = input[4]; tran_low_t x6 = input[1]; tran_low_t x7 = input[6]; - (void) bd; + (void)bd; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { memset(output, 0, 8 * sizeof(*output)); @@ -1569,14 +1553,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { } // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; s2 = cospi_10_64 * x2 + cospi_22_64 * x3; s3 = cospi_22_64 * x2 - cospi_10_64 * x3; s4 = cospi_18_64 * x4 + cospi_14_64 * x5; s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd); x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd); @@ -1592,10 +1576,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s1 = x1; s2 = x2; s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; x0 = HIGHBD_WRAPLOW(s0 + s2, bd); x1 = HIGHBD_WRAPLOW(s1 + s3, bd); @@ -1644,8 +1628,7 @@ void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, } // Then transform columns. 
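/*
 * Editor's sketch (illustrative only, not part of the patch; the function
 * name is hypothetical): the column pass that follows in
 * vpx_highbd_idct8x8_10_add_c -- like every 2-D inverse transform in this
 * file -- is the second half of a row/column decomposition.  Using the
 * file's own helpers, the 8-bit 8x8 form looks roughly like this; the final
 * shift is 5 for 8x8 and 6 for the 16x16/32x32 variants.
 */
static void idct8x8_add_sketch(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  tran_low_t out[8 * 8], temp_in[8], temp_out[8];
  int i, j;

  // First pass: 1-D inverse transform over each row of coefficients.
  for (i = 0; i < 8; ++i) {
    idct8_c(input, out + i * 8);
    input += 8;
  }

  // Second pass: 1-D inverse transform over each column, then round, clip
  // and add the residual to the prediction already in dest.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
  }
}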
for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); for (j = 0; j < 8; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -1657,25 +1640,25 @@ void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step1[16], step2[16]; tran_high_t temp1, temp2; - (void) bd; + (void)bd; // stage 1 - step1[0] = input[0/2]; - step1[1] = input[16/2]; - step1[2] = input[8/2]; - step1[3] = input[24/2]; - step1[4] = input[4/2]; - step1[5] = input[20/2]; - step1[6] = input[12/2]; - step1[7] = input[28/2]; - step1[8] = input[2/2]; - step1[9] = input[18/2]; - step1[10] = input[10/2]; - step1[11] = input[26/2]; - step1[12] = input[6/2]; - step1[13] = input[22/2]; - step1[14] = input[14/2]; - step1[15] = input[30/2]; + step1[0] = input[0 / 2]; + step1[1] = input[16 / 2]; + step1[2] = input[8 / 2]; + step1[3] = input[24 / 2]; + step1[4] = input[4 / 2]; + step1[5] = input[20 / 2]; + step1[6] = input[12 / 2]; + step1[7] = input[28 / 2]; + step1[8] = input[2 / 2]; + step1[9] = input[18 / 2]; + step1[10] = input[10 / 2]; + step1[11] = input[26 / 2]; + step1[12] = input[6 / 2]; + step1[13] = input[22 / 2]; + step1[14] = input[14 / 2]; + step1[15] = input[30 / 2]; // stage 2 step2[0] = step1[0]; @@ -1837,8 +1820,7 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, // Then transform columns. for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); for (j = 0; j < 16; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -1867,20 +1849,20 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t x13 = input[12]; tran_low_t x14 = input[1]; tran_low_t x15 = input[14]; - (void) bd; + (void)bd; - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 - | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { memset(output, 0, 16 * sizeof(*output)); return; } // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; s5 = x4 * cospi_23_64 - x5 * cospi_9_64; s6 = x6 * cospi_13_64 + x7 * cospi_19_64; s7 = x6 * cospi_19_64 - x7 * cospi_13_64; @@ -1889,9 +1871,9 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s10 = x10 * cospi_21_64 + x11 * cospi_11_64; s11 = x10 * cospi_11_64 - x11 * cospi_21_64; s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd); x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd); @@ -1901,8 +1883,8 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd); x6 = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd); x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); + x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd); x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd); x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd); @@ -1981,13 +1963,13 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); // stage 4 - s2 = (- cospi_16_64) * (x2 + x3); + s2 = (-cospi_16_64) * (x2 + x3); s3 = cospi_16_64 * (x2 - x3); s6 = cospi_16_64 * (x6 + x7); s7 = cospi_16_64 * (-x6 + x7); s10 = cospi_16_64 * (x10 + x11); s11 = cospi_16_64 * (-x10 + x11); - s14 = (- cospi_16_64) * (x14 + x15); + s14 = (-cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); @@ -2035,8 +2017,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, // Then transform columns. for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); for (j = 0; j < 16; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -2049,24 +2030,22 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = + HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); dest += stride; } } -void vpx_highbd_idct32_c(const tran_low_t *input, - tran_low_t *output, int bd) { +void vpx_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step1[32], step2[32]; tran_high_t temp1, temp2; - (void) bd; + (void)bd; // stage 1 step1[0] = input[0]; @@ -2442,8 +2421,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, // Rows for (i = 0; i < 32; ++i) { tran_low_t zero_coeff[16]; - for (j = 0; j < 16; ++j) - zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; for (j = 0; j < 8; ++j) zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; for (j = 0; j < 4; ++j) @@ -2461,8 +2439,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, // Columns for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; vpx_highbd_idct32_c(temp_in, temp_out, bd); for (j = 0; j < 32; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -2473,7 +2450,7 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { - tran_low_t out[32 * 32] = {0}; + 
tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; int i, j; tran_low_t temp_in[32], temp_out[32]; @@ -2488,8 +2465,7 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, } // Columns for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; vpx_highbd_idct32_c(temp_in, temp_out, bd); for (j = 0; j < 32; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -2504,14 +2480,13 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, int a1; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out = HIGHBD_WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = + HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) - dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); dest += stride; } } diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h index 0c4359c270831d0cad66394ed2f1ec57705ff384..0f31a79b5d1d6ad0608fa2f0c0bb2be61072e95e 100644 --- a/vpx_dsp/inv_txfm.h +++ b/vpx_dsp/inv_txfm.h @@ -41,8 +41,7 @@ static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { } #if CONFIG_VP9_HIGHBITDEPTH -static INLINE tran_high_t highbd_check_range(tran_high_t input, - int bd) { +static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { #if CONFIG_COEFFICIENT_RANGE_CHECKING // For valid highbitdepth VP9 streams, intermediate stage coefficients will // stay within the ranges: @@ -53,9 +52,9 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, const int32_t int_min = -int_max - 1; assert(int_min <= input); assert(input <= int_max); - (void) int_min; + (void)int_min; #endif // CONFIG_COEFFICIENT_RANGE_CHECKING - (void) bd; + (void)bd; return input; } @@ -86,15 +85,14 @@ static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) { #define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) #if CONFIG_VP9_HIGHBITDEPTH #define HIGHBD_WRAPLOW(x, bd) \ - ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd)) + ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd)) #endif // CONFIG_VP9_HIGHBITDEPTH -#else // CONFIG_EMULATE_HARDWARE +#else // CONFIG_EMULATE_HARDWARE #define WRAPLOW(x) ((int32_t)check_range(x)) #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_WRAPLOW(x, bd) \ - ((int32_t)highbd_check_range((x), bd)) +#define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd)) #endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_EMULATE_HARDWARE diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c index 645a1ab95ee996085af04c2391d723446f9ce7a4..40f02b46d93aaf4e3bd395d73225ea82c2097634 100644 --- a/vpx_dsp/loopfilter.c +++ b/vpx_dsp/loopfilter.c @@ -22,23 +22,18 @@ static INLINE int8_t signed_char_clamp(int t) { #if CONFIG_VP9_HIGHBITDEPTH static INLINE int16_t signed_char_clamp_high(int t, int bd) { switch (bd) { - case 10: - return (int16_t)clamp(t, -128*4, 128*4-1); - case 12: - return (int16_t)clamp(t, -128*16, 128*16-1); + case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1); + case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1); case 8: - default: - return (int16_t)clamp(t, -128, 128-1); + default: return (int16_t)clamp(t, -128, 128 - 1); } } #endif // should we apply any filter at all: 
11111111 yes, 00000000 no -static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, - uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, - uint8_t q2, uint8_t q3) { +static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, + uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, uint8_t q3) { int8_t mask = 0; mask |= (abs(p3 - p2) > limit) * -1; mask |= (abs(p2 - p1) > limit) * -1; @@ -46,14 +41,12 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, mask |= (abs(q1 - q0) > limit) * -1; mask |= (abs(q2 - q1) > limit) * -1; mask |= (abs(q3 - q2) > limit) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; return ~mask; } -static INLINE int8_t flat_mask4(uint8_t thresh, - uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, +static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { int8_t mask = 0; mask |= (abs(p1 - p0) > thresh) * -1; @@ -65,12 +58,10 @@ static INLINE int8_t flat_mask4(uint8_t thresh, return ~mask; } -static INLINE int8_t flat_mask5(uint8_t thresh, - uint8_t p4, uint8_t p3, - uint8_t p2, uint8_t p1, - uint8_t p0, uint8_t q0, - uint8_t q1, uint8_t q2, - uint8_t q3, uint8_t q4) { +static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3, + uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, uint8_t q3, + uint8_t q4) { int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); mask |= (abs(p4 - p0) > thresh) * -1; mask |= (abs(q4 - q0) > thresh) * -1; @@ -81,8 +72,8 @@ static INLINE int8_t flat_mask5(uint8_t thresh, static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { int8_t hev = 0; - hev |= (abs(p1 - p0) > thresh) * -1; - hev |= (abs(q1 - q0) > thresh) * -1; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; return hev; } @@ -90,10 +81,10 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { int8_t filter1, filter2; - const int8_t ps1 = (int8_t) *op1 ^ 0x80; - const int8_t ps0 = (int8_t) *op0 ^ 0x80; - const int8_t qs0 = (int8_t) *oq0 ^ 0x80; - const int8_t qs1 = (int8_t) *oq1 ^ 0x80; + const int8_t ps1 = (int8_t)*op1 ^ 0x80; + const int8_t ps0 = (int8_t)*op0 ^ 0x80; + const int8_t qs0 = (int8_t)*oq0 ^ 0x80; + const int8_t qs1 = (int8_t)*oq1 ^ 0x80; const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); // add outer taps if we have high edge variance @@ -127,9 +118,9 @@ void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); ++s; } @@ -151,9 +142,9 @@ void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, // of 8 bit simd instructions. 
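/*
 * Editor's note (illustrative sketch, not part of the patch): filter_mask(),
 * flat_mask4/5() and hev_mask() above all return branchless byte masks.
 * Each "(condition) * -1" term ORs 0xff into mask when a threshold is
 * exceeded, and the final ~mask is 0xff ("11111111 yes") only if every test
 * passed.  The idiom in isolation (helper name is hypothetical):
 */
static INLINE int8_t byte_mask_sketch(int edge_step_too_big,
                                      int edge_diff_too_big) {
  int8_t mask = 0;
  mask |= (edge_step_too_big != 0) * -1;  // becomes 0xff if this test fails
  mask |= (edge_diff_too_big != 0) * -1;  // becomes 0xff if this test fails
  return ~mask;  // 0xff = apply the filter, 0x00 = leave the pixels alone
}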
for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); filter4(mask, *thresh, s - 2, s - 1, s, s + 1); s += pitch; } @@ -168,9 +159,8 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, } static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, + uint8_t *op3, uint8_t *op2, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3) { if (flat && mask) { const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; @@ -184,7 +174,7 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); } else { - filter4(mask, thresh, op1, op0, oq0, oq1); + filter4(mask, thresh, op1, op0, oq0, oq1); } } @@ -198,11 +188,11 @@ void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p); + filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p); ++s; } } @@ -222,11 +212,11 @@ void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, for (i = 0; i < 8; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3); + filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, + s + 3); s += pitch; } } @@ -239,52 +229,55 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter16(int8_t mask, uint8_t thresh, - uint8_t flat, uint8_t flat2, - uint8_t *op7, uint8_t *op6, - uint8_t *op5, uint8_t *op4, - uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, - uint8_t *oq2, uint8_t *oq3, - uint8_t *oq4, uint8_t *oq5, +static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint8_t *op7, uint8_t *op6, + uint8_t *op5, uint8_t *op4, uint8_t *op3, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, + uint8_t *oq3, uint8_t *oq4, uint8_t *oq5, uint8_t *oq6, uint8_t *oq7) { if (flat2 && flat && mask) { - const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, - p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t 
p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, + p2 = *op2, p1 = *op1, p0 = *op0; - const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, - q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5, q6 = *oq6, q7 = *oq7; // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] - *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + - q0, 4); - *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + - q0 + q1, 4); - *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + - q0 + q1 + q2, 4); - *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + - q0 + q1 + q2 + q3, 4); - *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4, 4); + *op6 = ROUND_POWER_OF_TWO( + p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + *op5 = ROUND_POWER_OF_TWO( + p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO( + p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO( + p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO( + p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, + 4); *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + - q0 + q1 + q2 + q3 + q4 + q5, 4); - *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + - q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); - *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + - q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); - *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + - q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4); - *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + - q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4); - *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + - q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); - *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); - *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + - q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); - *oq6 = ROUND_POWER_OF_TWO(p0 + - q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + q0 + q1 + q2 + q3 + q4 + q5, + 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + + q1 + q2 + q3 + q4 + q5 + q6, + 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + + q2 + q3 + q4 + q5 + q6 + q7, + 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + + q3 + q4 + q5 + q6 + q7 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); } else { filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); } @@ -300,18 +293,17 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, for (i = 0; i < 8 * count; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, 
q0, q1, q2, q3); + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flat_mask5(1, - s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, - q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]); - - filter16(mask, *thresh, flat, flat2, - s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, - s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p); + const int8_t flat2 = + flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, + s[4 * p], s[5 * p], s[6 * p], s[7 * p]); + + filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, + s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, + s + 7 * p); ++s; } } @@ -326,25 +318,23 @@ void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit, mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); } -static void mb_lpf_vertical_edge_w(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, int count) { int i; for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3); + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, - q0, s[4], s[5], s[6], s[7]); + const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], + s[5], s[6], s[7]); - filter16(mask, *thresh, flat, flat2, - s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); + filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, + s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, + s + 7); s += p; } } @@ -362,9 +352,8 @@ void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, #if CONFIG_VP9_HIGHBITDEPTH // Should we apply any filter at all: 11111111 yes, 00000000 no ? 
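/*
 * Editor's note (illustrative sketch, not part of the patch): the
 * high-bit-depth masks below reuse the 8-bit blimit/limit/thresh values by
 * shifting them up to the working bit depth, e.g.
 * "limit16 = (uint16_t)limit << (bd - 8)", which matches the clamp ranges in
 * signed_char_clamp_high() (+/-128 at bd=8, +/-128*4 at bd=10, +/-128*16 at
 * bd=12).  In isolation (helper name is hypothetical):
 */
static INLINE int16_t highbd_scale_thresh_sketch(uint8_t thresh8, int bd) {
  // bd is 8, 10 or 12, so the shifted value still fits in an int16_t.
  return (int16_t)((uint16_t)thresh8 << (bd - 8));
}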
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, - uint16_t p3, uint16_t p2, - uint16_t p1, uint16_t p0, - uint16_t q0, uint16_t q1, + uint16_t p3, uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, int bd) { int8_t mask = 0; int16_t limit16 = (uint16_t)limit << (bd - 8); @@ -375,15 +364,14 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, mask |= (abs(q1 - q0) > limit16) * -1; mask |= (abs(q2 - q1) > limit16) * -1; mask |= (abs(q3 - q2) > limit16) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; return ~mask; } -static INLINE int8_t highbd_flat_mask4(uint8_t thresh, - uint16_t p3, uint16_t p2, - uint16_t p1, uint16_t p0, - uint16_t q0, uint16_t q1, - uint16_t q2, uint16_t q3, int bd) { +static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, uint16_t q3, + int bd) { int8_t mask = 0; int16_t thresh16 = (uint16_t)thresh << (bd - 8); mask |= (abs(p1 - p0) > thresh16) * -1; @@ -395,11 +383,9 @@ static INLINE int8_t highbd_flat_mask4(uint8_t thresh, return ~mask; } -static INLINE int8_t highbd_flat_mask5(uint8_t thresh, - uint16_t p4, uint16_t p3, - uint16_t p2, uint16_t p1, - uint16_t p0, uint16_t q0, - uint16_t q1, uint16_t q2, +static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3, + uint16_t p2, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, uint16_t q4, int bd) { int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd); int16_t thresh16 = (uint16_t)thresh << (bd - 8); @@ -470,21 +456,17 @@ void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, const uint16_t q1 = s[1 * p]; const uint16_t q2 = s[2 * p]; const uint16_t q3 = s[3 * p]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); ++s; } } -void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_horizontal_4_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); } @@ -498,31 +480,26 @@ void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, // of 8 bit simd instructions. 
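/*
 * Editor's note (illustrative sketch, not part of the patch): in all of the
 * loop-filter kernels p3..p0 are the samples on one side of the edge and
 * q0..q3 on the other, read with unit stride for a vertical edge
 * (s[-4]..s[-1], s[0]..s[3]) and with the row pitch for a horizontal edge
 * (s[-4 * p]..s[-p], s[0 * p]..s[3 * p]).  A generic gather of those eight
 * samples (helper name is hypothetical):
 */
static INLINE void gather_edge_samples_sketch(const uint16_t *s, int step,
                                              uint16_t p[4], uint16_t q[4]) {
  int i;
  // step == 1 for a vertical edge, step == pitch for a horizontal edge.
  for (i = 0; i < 4; ++i) {
    p[i] = s[-(i + 1) * step];  // p0 is closest to the edge, p3 farthest
    q[i] = s[i * step];         // q0 is closest to the edge, q3 farthest
  }
}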
for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); s += pitch; } } -void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_vertical_4_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, bd); + vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + bd); } static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, - uint16_t *op3, uint16_t *op2, - uint16_t *op1, uint16_t *op0, - uint16_t *oq0, uint16_t *oq1, + uint16_t *op3, uint16_t *op2, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, int bd) { if (flat && mask) { const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; @@ -536,7 +513,7 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); } else { - highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); } } @@ -551,25 +528,20 @@ void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); - highbd_filter8(mask, *thresh, flat, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, + s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); ++s; } } -void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_horizontal_8_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); } @@ -582,40 +554,31 @@ void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, for (i = 0; i < 8; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 
= s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); - highbd_filter8(mask, *thresh, flat, - s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, - bd); + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, + s + 2, s + 3, bd); s += pitch; } } -void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_vertical_8_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, - thresh1, bd); -} - -static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, - uint8_t flat, uint8_t flat2, - uint16_t *op7, uint16_t *op6, - uint16_t *op5, uint16_t *op4, - uint16_t *op3, uint16_t *op2, - uint16_t *op1, uint16_t *op0, - uint16_t *oq0, uint16_t *oq1, - uint16_t *oq2, uint16_t *oq3, - uint16_t *oq4, uint16_t *oq5, + vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint16_t *op7, uint16_t *op6, + uint16_t *op5, uint16_t *op4, uint16_t *op3, + uint16_t *op2, uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, + uint16_t *oq3, uint16_t *oq4, uint16_t *oq5, uint16_t *oq6, uint16_t *oq7, int bd) { if (flat2 && flat && mask) { const uint16_t p7 = *op7; @@ -636,34 +599,40 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, const uint16_t q7 = *oq7; // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] - *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + - q0, 4); - *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + - q0 + q1, 4); - *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + - q0 + q1 + q2, 4); - *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + - q0 + q1 + q2 + q3, 4); - *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4, 4); + *op6 = ROUND_POWER_OF_TWO( + p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + *op5 = ROUND_POWER_OF_TWO( + p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO( + p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO( + p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO( + p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, + 4); *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + - q0 + q1 + q2 + q3 + q4 + q5, 4); - *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + - q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); - *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + - q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); - *oq1 = 
ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + - q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4); - *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + - q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4); - *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + - q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); - *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + - q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); - *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + - q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); - *oq6 = ROUND_POWER_OF_TWO(p0 + - q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + q0 + q1 + q2 + q3 + q4 + q5, + 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + + q1 + q2 + q3 + q4 + q5 + q6, + 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + + q2 + q3 + q4 + q5 + q6 + q7, + 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + + q3 + q4 + q5 + q6 + q7 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); } else { highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, bd); @@ -673,8 +642,8 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count, int bd) { + const uint8_t *thresh, int count, + int bd) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -688,20 +657,18 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, const uint16_t q1 = s[1 * p]; const uint16_t q2 = s[2 * p]; const uint16_t q3 = s[3 * p]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); - const int8_t flat2 = highbd_flat_mask5( - 1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, - q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); - - highbd_filter16(mask, *thresh, flat, flat2, - s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, - s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p, - bd); + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat2 = + highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, + s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); + + highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, + s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, + s + 6 * p, s + 7 * p, bd); ++s; } } @@ -723,8 +690,8 @@ void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int count, int bd) { + const uint8_t *thresh, int count, + int bd) { int i; for (i = 0; i < count; ++i) { @@ -736,17 +703,16 @@ static void 
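The 15-tap sums above (centre sample weighted 2, its fourteen neighbours weighted 1, total weight 16) are normalized by ROUND_POWER_OF_TWO(sum, 4), i.e. a rounded division by 16. A minimal scalar sketch, assuming the macro matches the usual libvpx definition in vpx_dsp/vpx_dsp_common.h; the sample values are made up:

#include <stdint.h>
#include <stdio.h>

/* Assumed to match the libvpx macro from vpx_dsp/vpx_dsp_common.h. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  /* Illustrative 12-bit samples for p7..p0 and q0..q6 around one edge. */
  const uint16_t p[8] = { 820, 818, 816, 814, 812, 810, 808, 806 }; /* p7..p0 */
  const uint16_t q[7] = { 804, 802, 800, 798, 796, 794, 792 };      /* q0..q6 */
  uint32_t sum = p[7]; /* p0 is the centre tap and carries weight 2 */
  int i;
  for (i = 0; i < 8; ++i) sum += p[i];
  for (i = 0; i < 7; ++i) sum += q[i];
  /* Weights total 16, so the shift by 4 is a rounded average -> new *op0. */
  printf("op0 = %u\n", ROUND_POWER_OF_TWO(sum, 4));
  return 0;
}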
highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, const uint16_t q1 = s[1]; const uint16_t q2 = s[2]; const uint16_t q3 = s[3]; - const int8_t mask = highbd_filter_mask(*limit, *blimit, - p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, - bd); + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], s[7], bd); - highbd_filter16(mask, *thresh, flat, flat2, - s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7, - bd); + highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, + s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, + s + 5, s + 6, s + 7, bd); s += p; } } @@ -760,8 +726,7 @@ void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int bd) { + const uint8_t *thresh, int bd) { highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/mips/common_dspr2.h b/vpx_dsp/mips/common_dspr2.h index 7a10bf1c4050719bec76e0fef47d88d47ce86f01..0a42f5cec21f88c1564a228d45e9dd2ee2729d07 100644 --- a/vpx_dsp/mips/common_dspr2.h +++ b/vpx_dsp/mips/common_dspr2.h @@ -24,37 +24,21 @@ extern "C" { extern uint8_t *vpx_ff_cropTbl; // From "vpx_dsp/mips/intrapred4_dspr2.c" static INLINE void prefetch_load(const unsigned char *src) { - __asm__ __volatile__ ( - "pref 0, 0(%[src]) \n\t" - : - : [src] "r" (src) - ); + __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); } /* prefetch data for store */ static INLINE void prefetch_store(unsigned char *dst) { - __asm__ __volatile__ ( - "pref 1, 0(%[dst]) \n\t" - : - : [dst] "r" (dst) - ); + __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst)); } static INLINE void prefetch_load_streamed(const unsigned char *src) { - __asm__ __volatile__ ( - "pref 4, 0(%[src]) \n\t" - : - : [src] "r" (src) - ); + __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src)); } /* prefetch data for store */ static INLINE void prefetch_store_streamed(unsigned char *dst) { - __asm__ __volatile__ ( - "pref 5, 0(%[dst]) \n\t" - : - : [dst] "r" (dst) - ); + __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst)); } #endif // #if HAVE_DSPR2 #ifdef __cplusplus diff --git a/vpx_dsp/mips/convolve2_avg_dspr2.c b/vpx_dsp/mips/convolve2_avg_dspr2.c index 3c767672fbd23afdae79668dc6f08c1c20ed34a9..ae88eddfd61dcf38c7be78e3c7fc85120b94bd45 100644 --- a/vpx_dsp/mips/convolve2_avg_dspr2.c +++ b/vpx_dsp/mips/convolve2_avg_dspr2.c @@ -18,25 +18,22 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t w, +static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, int32_t h) { - int32_t x, y; + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + 
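The prefetch helpers and convolution kernels in these MIPS files only change whitespace here; the operand lists are untouched. For readers of the constraint strings: "=&r" is a write-only register output with early-clobber, "+r" a read-write register operand, and "r" a read-only register input. A commented restatement of prefetch_load, assuming a MIPS32 toolchain with the DSP ASE (this is not portable C):

/* Same statement as prefetch_load above, with the extended-asm pieces
 * spelled out.  "pref 0" is the prefetch-for-load hint; hints 1, 4 and 5
 * are the store / streamed variants used by the other helpers. */
static inline void prefetch_load_annotated(const unsigned char *src) {
  __asm__ __volatile__(
      "pref 0, 0(%[src]) \n\t" /* one instruction, no result register */
      :                        /* outputs: none */
      : [src] "r"(src)         /* inputs: src, read-only, in a register */
  );                           /* volatile: the compiler must not drop it */
}

/* In the convolution kernels the same syntax carries more operands:
 *   [load1] "=&r"(load1)     scratch result, early-clobber so it cannot
 *                            share a register with any input
 *   [src_ptr] "+r"(src_ptr)  read-write: the asm advances the pointer
 *   [filter45] "r"(filter45) plain read-only input                      */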
uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; const int16_t *filter = &filter_y[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -48,7 +45,7 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -105,16 +102,13 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), [cm] "r" (cm), - [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -124,23 +118,21 @@ static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, } static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, - int32_t h) { - int32_t x, y; + const int16_t *filter_y, int32_t h) { + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; const int16_t *filter = &filter_y[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -153,7 +145,7 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -210,16 +202,13 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), [cm] "r" (cm), - [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), 
[src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -231,18 +220,16 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { uint32_t pos = 38; assert(y_step_q4 == 16); /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); prefetch_store(dst); @@ -251,22 +238,17 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, case 8: case 16: case 32: - convolve_bi_avg_vert_4_dspr2(src, src_stride, - dst, dst_stride, - filter_y, w, h); + convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, + w, h); break; case 64: prefetch_store(dst + 32); - convolve_bi_avg_vert_64_dspr2(src, src_stride, - dst, dst_stride, - filter_y, h); + convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c index 932a73d39b17beda50b0ac1e59fafb62d5cf0a2b..e944207b6ee79761320e76e7360d7788a0696095 100644 --- a/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -19,20 +19,18 @@ #if HAVE_DSPR2 static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; - int32_t Temp1, Temp2, Temp3, Temp4; + int32_t Temp1, Temp2, Temp3, Temp4; uint32_t vector4a = 64; uint32_t tp1, tp2; uint32_t p1, p2, p3; uint32_t tn1, tn2; const int16_t *filter = &filter_x0[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -42,7 +40,7 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -61,51 +59,49 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" "extp %[Temp3], $ac2, 31 \n\t" - "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ "mtlo %[vector4a], $ac3 \n\t" "mthi $zero, $ac3 \n\t" - "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ "preceu.ph.qbr %[p1], %[tp2] \n\t" "preceu.ph.qbl %[p3], %[tp2] \n\t" "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ /* odd 2. 
pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ "mtlo %[vector4a], $ac2 \n\t" "mthi $zero, $ac2 \n\t" - "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ - "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" "extp %[Temp4], $ac2, 31 \n\t" - "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ - "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ /* clamp */ - "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ - "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ - "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ - "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ - "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ - "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ - "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); /* Next row... 
*/ src += src_stride; @@ -114,11 +110,9 @@ static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, } static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector4a = 64; @@ -127,7 +121,7 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, uint32_t p1, p2, p3, p4, n1; uint32_t st0, st1; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -137,7 +131,7 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -246,15 +240,12 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, "sb %[tp4], 5(%[dst]) \n\t" "sb %[tp1], 7(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [st0] "=&r" (st0), [st1] "=&r" (st1), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); /* Next row... 
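Every one of these 2-tap kernels derives filter45 the same way: filter = &filter_x0[3], and a 32-bit load packs taps 3 and 4 into one word so that dpa.w.ph can multiply-accumulate both in a single instruction. A portable sketch of the packing, using memcpy in place of the type-punned load; the halfword order inside the word follows host endianness, just as it does for the original cast:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  /* Illustrative bilinear kernel; only slots 3 and 4 are non-zero. */
  const int16_t filter_x0[8] = { 0, 0, 0, 96, 32, 0, 0, 0 };
  uint32_t filter45;

  /* Equivalent of filter45 = ((const int32_t *)&filter_x0[3])[0], without
   * the aliasing cast. */
  memcpy(&filter45, &filter_x0[3], sizeof(filter45));
  printf("packed taps 3 and 4: 0x%08" PRIx32 "\n", filter45);
  return 0;
}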
*/ src += src_stride; @@ -263,12 +254,10 @@ static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, } static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, - int32_t count) { + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { int32_t y, c; const uint8_t *src; uint8_t *dst; @@ -279,7 +268,7 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -293,7 +282,7 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -493,14 +482,13 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [qload3] "=&r" (qload3), [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); src += 16; dst += 16; @@ -513,11 +501,10 @@ static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, } static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { int32_t y, c; const uint8_t *src; uint8_t *dst; @@ -528,7 +515,7 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -544,7 +531,7 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride + 32); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -744,14 +731,13 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [qload3] "=&r" (qload3), [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), 
[dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); src += 16; dst += 16; @@ -773,11 +759,9 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); /* prefetch data to cache memory */ prefetch_load(src); @@ -786,39 +770,31 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_bi_avg_horiz_4_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; case 8: - convolve_bi_avg_horiz_8_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; case 16: - convolve_bi_avg_horiz_16_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h, 1); + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); break; case 32: - convolve_bi_avg_horiz_16_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h, 2); + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); break; case 64: prefetch_load(src + 64); prefetch_store(dst + 32); - convolve_bi_avg_horiz_64_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve2_dspr2.c b/vpx_dsp/mips/convolve2_dspr2.c index d111029d42a59858dccc7329d3298f2876826f57..e355ba3a06cb24a498bbed89a63e7ee75eb780d1 100644 --- a/vpx_dsp/mips/convolve2_dspr2.c +++ b/vpx_dsp/mips/convolve2_dspr2.c @@ -18,21 +18,18 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { - int32_t y; - uint8_t *cm = vpx_ff_cropTbl; - uint8_t *dst_ptr; - int32_t Temp1, Temp2; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2; +static void convolve_bi_horiz_4_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = vpx_ff_cropTbl; + uint8_t *dst_ptr; + int32_t Temp1, Temp2; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2; const int16_t *filter = &filter_x0[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -42,7 +39,7 @@ static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src, prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -94,13 +91,10 @@ static void 
convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src, "sb %[p2], 0(%[dst_ptr]) \n\t" "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [dst_ptr] "+r" (dst_ptr) - : [filter45] "r" (filter45),[vector4a] "r" (vector4a), - [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_stride] "r"(dst_stride)); /* Next row... */ src += src_stride; @@ -108,12 +102,9 @@ static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src, } } -static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_bi_horiz_8_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; uint8_t *dst_ptr; @@ -124,7 +115,7 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, uint8_t *odd_dst; uint32_t dst_pitch_2 = (dst_stride << 1); const int16_t *filter = &filter_x0[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -136,7 +127,7 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, dst_ptr = dst; odd_dst = (dst_ptr + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -180,7 +171,8 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" "extp %[Temp3], $ac2, 31 \n\t" - "lbux %[Temp1], %[p3](%[cm]) \n\t" + "lbux %[Temp1], %[p3](%[cm]) " + "\n\t" /* odd 1. pixel */ "mtlo %[vector4a], $ac1 \n\t" @@ -231,13 +223,12 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, "sb %[p1], 0(%[odd_dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) - : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr), + [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); /* Next row... 
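The _transposed kernels above write each horizontally filtered source row down a column of dst: within a row the store pointer advances by dst_stride per output pixel (the 8/16/64 variants interleave even and odd outputs through dst and odd_dst in steps of 2 * dst_stride), so the output block is the transpose of what a normal horizontal pass would produce. A plain-C model of that addressing, with the bilinear filter reduced to a rounded average purely for illustration:

#include <stdint.h>

/* out[x * dst_stride + y] = filter(in row y at column x): the horizontal
 * pass lands in a transposed intermediate buffer.  The real taps live in
 * filter45; a rounded average of two neighbouring pixels stands in here. */
static void bi_horiz_transposed_model(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride, int w,
                                      int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      const uint8_t filtered = (uint8_t)((src[x] + src[x + 1] + 1) >> 1);
      dst[x * dst_stride + y] = filtered;
    }
    src += src_stride;
  }
}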
*/ src += src_stride; @@ -245,26 +236,22 @@ static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, } } -static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, - int32_t count) { - int32_t c, y; +static void convolve_bi_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { + int32_t c, y; const uint8_t *src; - uint8_t *dst; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; const int16_t *filter = &filter_x0[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -279,193 +266,329 @@ static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr, odd_dst = (dst + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload1], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p5], %[qload1] \n\t" - "ulw %[qload2], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ /* even 3. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ /* even 7. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ /* odd 7. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); src += 16; dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); @@ -478,25 +601,22 @@ static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr, } } -static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { - int32_t c, y; +static void convolve_bi_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; const uint8_t *src; - uint8_t *dst; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; + uint8_t *dst; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t 
qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; const int16_t *filter = &filter_x0[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -512,193 +632,329 @@ static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr, odd_dst = (dst + dst_stride); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload1], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p5], %[qload1] \n\t" - "ulw %[qload2], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ /* even 4. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ /* even 8. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ /* odd 4. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); src += 16; dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); @@ -731,18 +987,15 @@ void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } } -void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter, - int w, int h) { +void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h) { uint32_t pos = 38; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); /* prefetch data to cache memory */ prefetch_load(src); @@ -750,32 +1003,26 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_bi_horiz_4_transposed_dspr2(src, src_stride, - dst, dst_stride, + convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride, filter, h); break; case 8: - convolve_bi_horiz_8_transposed_dspr2(src, src_stride, - dst, dst_stride, + convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride, filter, h); break; case 16: case 32: - convolve_bi_horiz_16_transposed_dspr2(src, src_stride, - dst, dst_stride, - filter, h, - (w/16)); + convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h, (w / 16)); break; case 64: prefetch_load(src + 32); - convolve_bi_horiz_64_transposed_dspr2(src, src_stride, - dst, dst_stride, + convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride, filter, h); break; default: - 
convolve_bi_horiz_transposed(src, src_stride, - dst, dst_stride, - filter, w, h); + convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w, + h); break; } } diff --git a/vpx_dsp/mips/convolve2_horiz_dspr2.c b/vpx_dsp/mips/convolve2_horiz_dspr2.c index 9fe1a3454b7e43fbdb0bf6175767eecd9b60e0c7..5cc06b5f26007da6b0d5ade8e11c1b4c8e0083fd 100644 --- a/vpx_dsp/mips/convolve2_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve2_horiz_dspr2.c @@ -18,12 +18,9 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_bi_horiz_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; int32_t Temp1, Temp2, Temp3, Temp4; @@ -31,7 +28,7 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src, uint32_t tp1, tp2; uint32_t p1, p2; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -41,7 +38,7 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -86,13 +83,11 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src, "sb %[tp2], 2(%[dst]) \n\t" "sb %[p2], 3(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); /* Next row... 
*/ src += src_stride; @@ -100,12 +95,9 @@ static void convolve_bi_horiz_4_dspr2(const uint8_t *src, } } -static void convolve_bi_horiz_8_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector4a = 64; @@ -114,7 +106,7 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src, uint32_t p1, p2, p3, p4; uint32_t st0, st1; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -124,7 +116,7 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -210,13 +202,12 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src, "sb %[p2], 5(%[dst]) \n\t" "sb %[p1], 7(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), - [st0] "=&r" (st0), [st1] "=&r" (st1), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); /* Next row... 
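For reference, the bilinear (convolve2) horizontal kernels above pack the two centre taps of the 8-tap filter into one 32-bit word (filter45, read from &filter_x0[3]) and apply it two pixels at a time with dpa.w.ph, extracting and clamping through the vpx_ff_cropTbl lookup. A minimal scalar sketch of what a single output pixel works out to, assuming the usual 7-bit filter precision (round by 64, shift by 7); the helper name is invented for illustration and is not part of the tree:

#include <stdint.h>

/* Illustrative only: scalar equivalent of one bilinear output pixel. */
static uint8_t bilinear_pixel_sketch(const uint8_t *src,
                                     const int16_t *filter_x0) {
  const int16_t *filter = &filter_x0[3];  /* centre taps of the 8-tap set */
  int sum = src[0] * filter[0] + src[1] * filter[1];
  sum = (sum + 64) >> 7;                  /* round, assuming FILTER_BITS == 7 */
  if (sum < 0) sum = 0;                   /* clamp, same effect as the cm[] table */
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}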
*/ src += src_stride; @@ -225,11 +216,9 @@ static void convolve_bi_horiz_8_dspr2(const uint8_t *src, } static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, + int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, + const int16_t *filter_x0, int32_t h, int32_t count) { int32_t y, c; const uint8_t *src; @@ -241,7 +230,7 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -255,7 +244,7 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -413,14 +402,13 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); src += 16; dst += 16; @@ -433,11 +421,9 @@ static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, } static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, + int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { + const int16_t *filter_x0, int32_t h) { int32_t y, c; const uint8_t *src; uint8_t *dst; @@ -448,7 +434,7 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; const int16_t *filter = &filter_x0[3]; - uint32_t filter45;; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -464,7 +450,7 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride + 32); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -622,14 +608,13 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter45] "r" (filter45), [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] 
"=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); src += 16; dst += 16; @@ -644,8 +629,8 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { uint32_t pos = 38; assert(x_step_q4 == 16); @@ -653,11 +638,9 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, prefetch_load((const uint8_t *)filter_x); /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); /* prefetch data to cache memory */ prefetch_load(src); @@ -666,39 +649,31 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; case 8: - convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; case 16: - convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h, 1); + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); break; case 32: - convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h, 2); + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); break; case 64: prefetch_load(src + 64); prefetch_store(dst + 32); - convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve2_vert_dspr2.c b/vpx_dsp/mips/convolve2_vert_dspr2.c index dde6ffd54f8182bad34f3128079da4d94f87a0b4..eb1975e4475ac4883ae0b73391595361b35000d7 100644 --- a/vpx_dsp/mips/convolve2_vert_dspr2.c +++ b/vpx_dsp/mips/convolve2_vert_dspr2.c @@ -18,25 +18,22 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_bi_vert_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t w, +static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, int32_t h) { - int32_t x, y; + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1; - uint32_t store1, store2; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + 
uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; const int16_t *filter = &filter_y[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -48,7 +45,7 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -98,16 +95,12 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [scratch1] "=&r" (scratch1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [filter45] "r" (filter45),[vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), - [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -116,24 +109,21 @@ static void convolve_bi_vert_4_dspr2(const uint8_t *src, } } -static void convolve_bi_vert_64_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t h) { - int32_t x, y; +static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1; - uint32_t store1, store2; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; const int16_t *filter = &filter_y[3]; - uint32_t filter45; + uint32_t filter45; filter45 = ((const int32_t *)filter)[0]; @@ -145,7 +135,7 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -195,16 +185,12 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [scratch1] "=&r" (scratch1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [filter45] "r" (filter45),[vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), - [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] 
"r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -216,42 +202,34 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { uint32_t pos = 38; assert(y_step_q4 == 16); /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); prefetch_store(dst); switch (w) { - case 4 : - case 8 : - case 16 : - case 32 : - convolve_bi_vert_4_dspr2(src, src_stride, - dst, dst_stride, - filter_y, w, h); + case 4: + case 8: + case 16: + case 32: + convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); break; - case 64 : + case 64: prefetch_store(dst + 32); - convolve_bi_vert_64_dspr2(src, src_stride, - dst, dst_stride, - filter_y, h); + convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve8_avg_dspr2.c b/vpx_dsp/mips/convolve8_avg_dspr2.c index 43da9e54fb2f7b88578d38a476c3660e6de7d884..31812299c34e1d17d909ed5b550f6aacf64ef97b 100644 --- a/vpx_dsp/mips/convolve8_avg_dspr2.c +++ b/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -18,25 +18,22 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_avg_vert_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t w, +static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, int32_t h) { - int32_t x, y; + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; vector1b = ((const int32_t *)filter_y)[0]; vector2b = ((const int32_t *)filter_y)[1]; @@ -53,7 +50,7 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -160,18 +157,16 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [vector1b] "r" 
(vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -180,24 +175,21 @@ static void convolve_avg_vert_4_dspr2(const uint8_t *src, } } -static void convolve_avg_vert_64_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t h) { - int32_t x, y; +static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = vpx_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; + uint8_t *dst_ptr; + uint8_t *cm = vpx_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; vector1b = ((const int32_t *)filter_y)[0]; vector2b = ((const int32_t *)filter_y)[1]; @@ -215,7 +207,7 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -322,18 +314,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... 
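The avg vertical kernels above run the same 8-tap column filter as the plain vertical path and then fold in the pixel already present in dst with a rounding average (the addqh_r.w step). A hedged scalar sketch of one output pixel, again assuming 7-bit filter precision; the function name is invented for illustration:

#include <stdint.h>

/* Illustrative only: 8-tap vertical filter for one pixel, averaged with
 * the value already stored at the destination. */
static uint8_t avg_vert_pixel_sketch(const uint8_t *src, int src_stride,
                                     uint8_t old_dst,
                                     const int16_t *filter_y) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k)
    sum += src[k * src_stride] * filter_y[k];  /* column taps */
  sum = (sum + 64) >> 7;                       /* round, FILTER_BITS == 7 assumed */
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)((old_dst + sum + 1) >> 1);  /* rounding average with dst */
}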
*/ @@ -345,26 +335,21 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_avg_vert_dspr2(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } else { uint32_t pos = 38; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); prefetch_store(dst); @@ -373,22 +358,17 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, case 8: case 16: case 32: - convolve_avg_vert_4_dspr2(src, src_stride, - dst, dst_stride, - filter_y, w, h); + convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); break; case 64: prefetch_store(dst + 32); - convolve_avg_vert_64_dspr2(src, src_stride, - dst, dst_stride, - filter_y, h); + convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } @@ -397,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { /* Fixed size intermediate buffer places limits on parameters. 
*/ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; @@ -408,27 +388,20 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); assert(y_step_q4 == 16); - if (intermediate_height < h) - intermediate_height = h; + if (intermediate_height < h) intermediate_height = h; - vpx_convolve8_horiz(src - (src_stride * 3), src_stride, - temp, 64, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, intermediate_height); + vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, intermediate_height); - vpx_convolve8_avg_vert(temp + 64 * 3, 64, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { + const int16_t *filter_y, int filter_y_stride, int w, + int h) { int x, y; uint32_t tp1, tp2, tn1; uint32_t tp3, tp4, tn2; @@ -441,21 +414,19 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: /* 1 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ - : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1), - [tp2] "=&r" (tp2) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; @@ -463,26 +434,24 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, break; case 8: /* 2 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; @@ -490,34 +459,32 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, break; case 16: /* 4 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ 
__volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 8(%[src]) \n\t" "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ "ulw %[tp3], 12(%[src]) \n\t" "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; @@ -525,50 +492,48 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, break; case 32: /* 8 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 8(%[src]) \n\t" "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ "ulw %[tp3], 12(%[src]) \n\t" "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 16(%[src]) \n\t" "ulw %[tp2], 16(%[dst]) \n\t" - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ "ulw %[tp3], 20(%[src]) \n\t" "ulw %[tp4], 20(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 24(%[src]) \n\t" "ulw %[tp2], 24(%[dst]) \n\t" - "sw %[tn1], 16(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ "ulw %[tp3], 28(%[src]) \n\t" "ulw %[tp4], 28(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 24(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw 
%[tn2], 28(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; @@ -579,84 +544,82 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, prefetch_store(dst + 32); /* 16 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 64); prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride + 32); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 0(%[dst]) \n\t" "ulw %[tp3], 4(%[src]) \n\t" "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 8(%[src]) \n\t" "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ "ulw %[tp3], 12(%[src]) \n\t" "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 16(%[src]) \n\t" "ulw %[tp2], 16(%[dst]) \n\t" - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ "ulw %[tp3], 20(%[src]) \n\t" "ulw %[tp4], 20(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 24(%[src]) \n\t" "ulw %[tp2], 24(%[dst]) \n\t" - "sw %[tn1], 16(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ "ulw %[tp3], 28(%[src]) \n\t" "ulw %[tp4], 28(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 32(%[src]) \n\t" "ulw %[tp2], 32(%[dst]) \n\t" - "sw %[tn1], 24(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 28(%[dst]) \n\t" /* store */ + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ "ulw %[tp3], 36(%[src]) \n\t" "ulw %[tp4], 36(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 40(%[src]) \n\t" "ulw %[tp2], 40(%[dst]) \n\t" - "sw %[tn1], 32(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 36(%[dst]) \n\t" /* store */ + "sw %[tn1], 32(%[dst]) \n\t" /* store */ + 
"adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 36(%[dst]) \n\t" /* store */ "ulw %[tp3], 44(%[src]) \n\t" "ulw %[tp4], 44(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 48(%[src]) \n\t" "ulw %[tp2], 48(%[dst]) \n\t" - "sw %[tn1], 40(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 44(%[dst]) \n\t" /* store */ + "sw %[tn1], 40(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 44(%[dst]) \n\t" /* store */ "ulw %[tp3], 52(%[src]) \n\t" "ulw %[tp4], 52(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ "ulw %[tp1], 56(%[src]) \n\t" "ulw %[tp2], 56(%[dst]) \n\t" - "sw %[tn1], 48(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 52(%[dst]) \n\t" /* store */ + "sw %[tn1], 48(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 52(%[dst]) \n\t" /* store */ "ulw %[tp3], 60(%[src]) \n\t" "ulw %[tp4], 60(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 56(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 60(%[dst]) \n\t" /* store */ - - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) - : [src] "r" (src), [dst] "r" (dst) - ); + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 56(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; diff --git a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c index db0c2a4da5ad6c317ffe17acdef0e357df73e9a6..9a9bab25a59a79a087a121e3e4095d5f395aaa2b 100644 --- a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -18,16 +18,13 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_avg_horiz_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3, Temp4; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; uint32_t vector4a = 64; uint32_t tp1, tp2; uint32_t p1, p2, p3, p4; @@ -45,7 +42,7 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -76,13 +73,13 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src, "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" "extp %[Temp3], $ac2, 31 \n\t" - "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ /* odd 1. 
pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ "mtlo %[vector4a], $ac3 \n\t" "mthi $zero, $ac3 \n\t" - "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ "preceu.ph.qbr %[n1], %[tp2] \n\t" "preceu.ph.qbl %[n2], %[tp2] \n\t" "preceu.ph.qbr %[n3], %[tn2] \n\t" @@ -93,46 +90,44 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src, "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ /* odd 2. pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ "mtlo %[vector4a], $ac2 \n\t" "mthi $zero, $ac2 \n\t" "preceu.ph.qbr %[n1], %[tn1] \n\t" - "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ - "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" "extp %[Temp4], $ac2, 31 \n\t" - "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ - "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ /* clamp */ - "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ - "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ - "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ - - "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ - "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ - - "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ - "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ - - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); /* Next row... 
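Both the plain averaging copy in vpx_convolve_avg_dspr2 above (adduh_r.qb on whole 32-bit words) and the avg horizontal kernel here (addqh_r.w after the filter) reduce to the same rounding average, dst = (dst + src + 1) >> 1 per pixel. A sketch of one word of that operation, with an invented helper name, under the assumption that four packed bytes are handled per word as in the DSPR2 code:

#include <stdint.h>

/* Illustrative only: per-byte rounding average of a source word into a
 * destination word, the scalar equivalent of one adduh_r.qb. */
static uint32_t avg_word_sketch(uint32_t src_word, uint32_t dst_word) {
  uint32_t out = 0;
  int i;
  for (i = 0; i < 4; ++i) {
    uint32_t s = (src_word >> (8 * i)) & 0xffu;
    uint32_t d = (dst_word >> (8 * i)) & 0xffu;
    out |= ((s + d + 1) >> 1) << (8 * i);
  }
  return out;
}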
*/ src += src_stride; @@ -140,12 +135,9 @@ static void convolve_avg_horiz_4_dspr2(const uint8_t *src, } } -static void convolve_avg_horiz_8_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector4a = 64; @@ -167,7 +159,7 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -309,17 +301,15 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src, "sb %[tn3], 5(%[dst]) \n\t" "sb %[tn1], 7(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3), - [st0] "=&r" (st0), [st1] "=&r" (st1), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); /* Next row... 
*/ src += src_stride; @@ -328,11 +318,9 @@ static void convolve_avg_horiz_8_dspr2(const uint8_t *src, } static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, + int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, + const int16_t *filter_x0, int32_t h, int32_t count) { int32_t y, c; const uint8_t *src; @@ -360,7 +348,7 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -618,16 +606,15 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [qload3] "=&r" (qload3), [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); src += 16; dst += 16; @@ -640,11 +627,9 @@ static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, } static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, + int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { + const int16_t *filter_x0, int32_t h) { int32_t y, c; const uint8_t *src; uint8_t *dst; @@ -673,7 +658,7 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride + 32); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -931,16 +916,15 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [qload3] "=&r" (qload3), [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + 
[filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); src += 16; dst += 16; @@ -961,22 +945,17 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_avg_horiz_dspr2(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } else { uint32_t pos = 38; src -= 3; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); /* prefetch data to cache memory */ prefetch_load(src); @@ -985,39 +964,32 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_avg_horiz_4_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; case 8: - convolve_avg_horiz_8_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; case 16: - convolve_avg_horiz_16_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h, 1); + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); break; case 32: - convolve_avg_horiz_16_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h, 2); + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); break; case 64: prefetch_load(src + 64); prefetch_store(dst + 32); - convolve_avg_horiz_64_dspr2(src, src_stride, - dst, dst_stride, - filter_x, h); + convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); break; default: - vpx_convolve8_avg_horiz_c(src + 3, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, + h); break; } } diff --git a/vpx_dsp/mips/convolve8_dspr2.c b/vpx_dsp/mips/convolve8_dspr2.c index ddad186922835c8933fa1c377e7e966b43dc4925..789ec8d53d8313ae7672b041a1a858d53ad85745 100644 --- a/vpx_dsp/mips/convolve8_dspr2.c +++ b/vpx_dsp/mips/convolve8_dspr2.c @@ -19,8 +19,7 @@ #if HAVE_DSPR2 static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int16_t *filter_x0, int32_t h) { @@ -45,7 +44,7 @@ static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -118,15 +117,14 @@ static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, "sb %[p2], 0(%[dst_ptr]) \n\t" "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), - [dst_ptr] "+r" (dst_ptr) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [cm] "r" (cm), [src] "r" (src), 
[dst_stride] "r" (dst_stride) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_stride] "r"(dst_stride)); /* Next row... */ src += src_stride; @@ -135,8 +133,7 @@ static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, } static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int16_t *filter_x0, int32_t h) { @@ -164,7 +161,7 @@ static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, dst_ptr = dst; odd_dst = (dst_ptr + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp2], 0(%[src]) \n\t" "ulw %[tp1], 4(%[src]) \n\t" @@ -293,16 +290,14 @@ static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, "sb %[n1], 0(%[odd_dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); /* Next row... */ src += src_stride; @@ -310,25 +305,21 @@ static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, } } -static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, - int32_t count) { +static void convolve_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { int32_t c, y; const uint8_t *src; uint8_t *dst; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; uint32_t qload1, qload2; uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; + uint8_t *odd_dst; filter12 = ((const int32_t *)filter_x0)[0]; filter34 = ((const int32_t *)filter_x0)[1]; @@ -346,248 +337,439 @@ static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, odd_dst = (dst + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" /* even 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "ulw %[qload2], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ /* even 4. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 16(%[src]) \n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ /* even 7. 
pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" /* odd 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ /* odd 6. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); src += 16; dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); @@ -601,24 +783,21 @@ static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, } } -static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { int32_t c, y; const uint8_t *src; uint8_t *dst; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; uint32_t qload1, qload2; uint32_t p1, p2, p3, p4, p5; uint32_t st1, st2, st3; uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; + uint8_t *odd_dst; filter12 = ((const int32_t *)filter_x0)[0]; filter34 = ((const int32_t *)filter_x0)[1]; @@ -637,248 +816,439 @@ static void convolve_horiz_64_transposed_dspr2(const 
uint8_t *src_ptr, odd_dst = (dst + dst_stride); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "ulw %[qload2], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ /* even 3. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 16(%[src]) \n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ /* even 6. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" /* odd 1. 
pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload2] \n\t" - "preceu.ph.qbl %[p5], %[qload2] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ /* odd 3. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ /* odd 6. 
pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "ulw %[qload1], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p5], %[qload1] \n\t" - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ /* odd 8. 
pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ - - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), [cm] "r" (cm), - [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) - ); + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); src += 16; dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); @@ -901,8 +1271,7 @@ void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, for (x = 0; x < w; ++x) { int sum = 0; - for (k = 0; k < 8; ++k) - sum += src[x + k] * filter[k]; + for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k]; dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); } @@ -913,8 +1282,7 @@ void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - int w, int h) { + uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { int x, y; for (y = 0; y < h; ++y) { @@ -927,10 +1295,9 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } } -void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t 
*filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; @@ -941,27 +1308,20 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(((const int32_t *)filter_x)[1] != 0x800000); assert(((const int32_t *)filter_y)[1] != 0x800000); - /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); - if (intermediate_height < h) - intermediate_height = h; + if (intermediate_height < h) intermediate_height = h; /* copy the src to dst */ if (filter_x[3] == 0x80) { - copy_horiz_transposed(src - src_stride * 3, src_stride, - temp, intermediate_height, - w, intermediate_height); + copy_horiz_transposed(src - src_stride * 3, src_stride, temp, + intermediate_height, w, intermediate_height); } else if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_dspr2(src - src_stride * 3, src_stride, - temp, intermediate_height, - filter_x, - w, intermediate_height); + vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp, + intermediate_height, filter_x, w, intermediate_height); } else { src -= (src_stride * 3 + 3); @@ -971,31 +1331,29 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_horiz_4_transposed_dspr2(src, src_stride, - temp, intermediate_height, - filter_x, intermediate_height); + convolve_horiz_4_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); break; case 8: - convolve_horiz_8_transposed_dspr2(src, src_stride, - temp, intermediate_height, - filter_x, intermediate_height); + convolve_horiz_8_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); break; case 16: case 32: - convolve_horiz_16_transposed_dspr2(src, src_stride, - temp, intermediate_height, - filter_x, intermediate_height, - (w/16)); + convolve_horiz_16_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height, (w / 16)); break; case 64: prefetch_load(src + 32); - convolve_horiz_64_transposed_dspr2(src, src_stride, - temp, intermediate_height, - filter_x, intermediate_height); + convolve_horiz_64_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); break; default: - convolve_horiz_transposed(src, src_stride, - temp, intermediate_height, + convolve_horiz_transposed(src, src_stride, temp, intermediate_height, filter_x, w, intermediate_height); break; } @@ -1003,40 +1361,31 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, /* copy the src to dst */ if (filter_y[3] == 0x80) { - copy_horiz_transposed(temp + 3, intermediate_height, - dst, dst_stride, - h, w); + copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w); } else if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_dspr2(temp + 3, intermediate_height, - dst, dst_stride, - filter_y, - h, w); + vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride, + filter_y, h, w); } else { switch (h) { case 4: - convolve_horiz_4_transposed_dspr2(temp, intermediate_height, - dst, dst_stride, - filter_y, w); + convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); break; case 8: - convolve_horiz_8_transposed_dspr2(temp, intermediate_height, - dst, dst_stride, - filter_y, w); + 
convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); break; case 16: case 32: - convolve_horiz_16_transposed_dspr2(temp, intermediate_height, - dst, dst_stride, - filter_y, w, (h/16)); + convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w, (h / 16)); break; case 64: - convolve_horiz_64_transposed_dspr2(temp, intermediate_height, - dst, dst_stride, - filter_y, w); + convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); break; default: - convolve_horiz_transposed(temp, intermediate_height, - dst, dst_stride, + convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride, filter_y, h, w); break; } @@ -1056,97 +1405,87 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, prefetch_store(dst); switch (w) { - case 4: - { + case 4: { uint32_t tp1; /* 1 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], (%[src]) \n\t" - "sw %[tp1], (%[dst]) \n\t" /* store */ + "sw %[tp1], (%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } - } - break; - case 8: - { + } break; + case 8: { uint32_t tp1, tp2; /* 2 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } - } - break; - case 16: - { + } break; + case 16: { uint32_t tp1, tp2, tp3, tp4; /* 4 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp3], 8(%[src]) \n\t" "ulw %[tp4], 12(%[src]) \n\t" - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4) - : [src] "r" (src), [dst] "r" (dst) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } - } - break; - case 32: - { + } break; + case 32: { uint32_t tp1, tp2, tp3, tp4; uint32_t tp5, tp6, tp7, tp8; /* 8 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp3], 8(%[src]) 
\n\t" @@ -1156,29 +1495,25 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, "ulw %[tp7], 24(%[src]) \n\t" "ulw %[tp8], 28(%[src]) \n\t" - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ - "sw %[tp5], 16(%[dst]) \n\t" /* store */ - "sw %[tp6], 20(%[dst]) \n\t" /* store */ - "sw %[tp7], 24(%[dst]) \n\t" /* store */ - "sw %[tp8], 28(%[dst]) \n\t" /* store */ - - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), - [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) - : [src] "r" (src), [dst] "r" (dst) - ); + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } - } - break; - case 64: - { + } break; + case 64: { uint32_t tp1, tp2, tp3, tp4; uint32_t tp5, tp6, tp7, tp8; @@ -1186,14 +1521,14 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, prefetch_store(dst + 32); /* 16 word storage */ - for (y = h; y--; ) { + for (y = h; y--;) { prefetch_load(src + src_stride); prefetch_load(src + src_stride + 32); prefetch_load(src + src_stride + 64); prefetch_store(dst + dst_stride); prefetch_store(dst + dst_stride + 32); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" "ulw %[tp3], 8(%[src]) \n\t" @@ -1203,14 +1538,14 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, "ulw %[tp7], 24(%[src]) \n\t" "ulw %[tp8], 28(%[src]) \n\t" - "sw %[tp1], 0(%[dst]) \n\t" /* store */ - "sw %[tp2], 4(%[dst]) \n\t" /* store */ - "sw %[tp3], 8(%[dst]) \n\t" /* store */ - "sw %[tp4], 12(%[dst]) \n\t" /* store */ - "sw %[tp5], 16(%[dst]) \n\t" /* store */ - "sw %[tp6], 20(%[dst]) \n\t" /* store */ - "sw %[tp7], 24(%[dst]) \n\t" /* store */ - "sw %[tp8], 28(%[dst]) \n\t" /* store */ + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ "ulw %[tp1], 32(%[src]) \n\t" "ulw %[tp2], 36(%[src]) \n\t" @@ -1221,29 +1556,26 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, "ulw %[tp7], 56(%[src]) \n\t" "ulw %[tp8], 60(%[src]) \n\t" - "sw %[tp1], 32(%[dst]) \n\t" /* store */ - "sw %[tp2], 36(%[dst]) \n\t" /* store */ - "sw %[tp3], 40(%[dst]) \n\t" /* store */ - "sw %[tp4], 44(%[dst]) \n\t" /* store */ - "sw %[tp5], 48(%[dst]) \n\t" /* store */ - "sw %[tp6], 52(%[dst]) \n\t" /* store */ - "sw %[tp7], 56(%[dst]) \n\t" /* store */ - "sw %[tp8], 60(%[dst]) \n\t" /* store */ - - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), - [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), - [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) - : [src] "r" (src), [dst] "r" (dst) - ); + "sw %[tp1], 32(%[dst]) \n\t" /* store */ + "sw %[tp2], 
36(%[dst]) \n\t" /* store */ + "sw %[tp3], 40(%[dst]) \n\t" /* store */ + "sw %[tp4], 44(%[dst]) \n\t" /* store */ + "sw %[tp5], 48(%[dst]) \n\t" /* store */ + "sw %[tp6], 52(%[dst]) \n\t" /* store */ + "sw %[tp7], 56(%[dst]) \n\t" /* store */ + "sw %[tp8], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); src += src_stride; dst += dst_stride; } - } - break; + } break; default: - for (y = h; y--; ) { + for (y = h; y--;) { for (x = 0; x < w; ++x) { dst[x] = src[x]; } diff --git a/vpx_dsp/mips/convolve8_horiz_dspr2.c b/vpx_dsp/mips/convolve8_horiz_dspr2.c index ae78bab8924832ca60ec8a1452f51484f9a7a260..196a0a2f0be98384895dd6b067700b17ceabcb90 100644 --- a/vpx_dsp/mips/convolve8_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve8_horiz_dspr2.c @@ -18,12 +18,9 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_horiz_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; int32_t vector1b, vector2b, vector3b, vector4b; @@ -45,7 +42,7 @@ static void convolve_horiz_4_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -111,17 +108,15 @@ static void convolve_horiz_4_dspr2(const uint8_t *src, "sb %[tp2], 2(%[dst]) \n\t" "sb %[n2], 3(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); /* Next row... 
*/ src += src_stride; @@ -129,12 +124,9 @@ static void convolve_horiz_4_dspr2(const uint8_t *src, } } -static void convolve_horiz_8_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y; uint8_t *cm = vpx_ff_cropTbl; uint32_t vector4a = 64; @@ -156,7 +148,7 @@ static void convolve_horiz_8_dspr2(const uint8_t *src, prefetch_load(src + src_stride + 32); prefetch_store(dst + dst_stride); - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[tp1], 0(%[src]) \n\t" "ulw %[tp2], 4(%[src]) \n\t" @@ -275,17 +267,15 @@ static void convolve_horiz_8_dspr2(const uint8_t *src, "sb %[p2], 5(%[dst]) \n\t" "sb %[n1], 7(%[dst]) \n\t" - : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), - [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3), - [st0] "=&r" (st0), [st1] "=&r" (st1), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [n1] "=&r" (n1), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), - [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) - ); + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); /* Next row... 
*/ src += src_stride; @@ -293,12 +283,9 @@ static void convolve_horiz_8_dspr2(const uint8_t *src, } } -static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h, +static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h, int32_t count) { int32_t y, c; const uint8_t *src; @@ -326,7 +313,7 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride); for (c = 0; c < count; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -542,17 +529,15 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), - [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); src += 16; dst += 16; @@ -564,12 +549,9 @@ static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, } } -static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, - uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { +static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { int32_t y, c; const uint8_t *src; uint8_t *dst; @@ -598,7 +580,7 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, prefetch_store(dst_ptr + dst_stride + 32); for (c = 0; c < 4; c++) { - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[qload1], 0(%[src]) \n\t" "ulw %[qload2], 4(%[src]) \n\t" @@ -814,17 +796,15 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ - : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), - [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), - [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), - [p5] "=&r" (p5), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) - : [filter12] "r" (filter12), [filter34] "r" (filter34), - [filter56] "r" (filter56), [filter78] "r" (filter78), - [vector_64] "r" (vector_64), - [cm] "r" (cm), [dst] "r" (dst), - [src] "r" (src) - ); + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] 
"=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); src += 16; dst += 16; @@ -839,17 +819,14 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_horiz_dspr2(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -857,11 +834,9 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, src -= 3; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); /* prefetch data to cache memory */ prefetch_load(src); @@ -870,39 +845,31 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, switch (w) { case 4: - convolve_horiz_4_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; case 8: - convolve_horiz_8_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; case 16: - convolve_horiz_16_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h, 1); + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); break; case 32: - convolve_horiz_16_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h, 2); + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); break; case 64: prefetch_load(src + 64); prefetch_store(dst + 32); - convolve_horiz_64_dspr2(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filter_x, (int32_t)h); + convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src + 3, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve8_vert_dspr2.c b/vpx_dsp/mips/convolve8_vert_dspr2.c index d553828c59a2b4a51f3cb65f0ba93281aeeff217..ad107d5c47309d8b9540ac0d43fd9464e7052482 100644 --- a/vpx_dsp/mips/convolve8_vert_dspr2.c +++ b/vpx_dsp/mips/convolve8_vert_dspr2.c @@ -18,12 +18,9 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -static void convolve_vert_4_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t w, +static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, int32_t h) { int32_t x, y; const uint8_t *src_ptr; @@ -53,7 +50,7 @@ static void 
convolve_vert_4_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -152,19 +149,16 @@ static void convolve_vert_4_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [n1] "=&r" (n1), [n2] "=&r" (n2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), [src_stride] "r" (src_stride), - [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... */ @@ -173,12 +167,9 @@ static void convolve_vert_4_dspr2(const uint8_t *src, } } -static void convolve_vert_64_dspr2(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, - int32_t h) { +static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { int32_t x, y; const uint8_t *src_ptr; uint8_t *dst_ptr; @@ -208,7 +199,7 @@ static void convolve_vert_64_dspr2(const uint8_t *src, src_ptr = src + x; dst_ptr = dst + x; - __asm__ __volatile__ ( + __asm__ __volatile__( "ulw %[load1], 0(%[src_ptr]) \n\t" "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" "ulw %[load2], 0(%[src_ptr]) \n\t" @@ -307,19 +298,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src, "sb %[store1], 2(%[dst_ptr]) \n\t" "sb %[store2], 3(%[dst_ptr]) \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [p1] "=&r" (p1), [p2] "=&r" (p2), - [n1] "=&r" (n1), [n2] "=&r" (n2), - [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), - [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), - [store1] "=&r" (store1), [store2] "=&r" (store2), - [src_ptr] "+r" (src_ptr) - : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), - [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), - [vector4a] "r" (vector4a), [src_stride] "r" (src_stride), - [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); } /* Next row... 
*/ @@ -331,50 +319,38 @@ static void convolve_vert_64_dspr2(const uint8_t *src, void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_vert_dspr2(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); } else { uint32_t pos = 38; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); prefetch_store(dst); switch (w) { - case 4 : - case 8 : - case 16 : - case 32 : - convolve_vert_4_dspr2(src, src_stride, - dst, dst_stride, - filter_y, w, h); + case 4: + case 8: + case 16: + case 32: + convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h); break; - case 64 : + case 64: prefetch_store(dst + 32); - convolve_vert_64_dspr2(src, src_stride, - dst, dst_stride, - filter_y, h); + convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve_common_dspr2.h b/vpx_dsp/mips/convolve_common_dspr2.h index 66d77a28544b9619432f646f530943899f320968..4eee3bd5e1580f54648c49ea2b078f7777947d4f 100644 --- a/vpx_dsp/mips/convolve_common_dspr2.h +++ b/vpx_dsp/mips/convolve_common_dspr2.h @@ -25,8 +25,8 @@ extern "C" { void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); + const int16_t *filter_y, int y_step_q4, int w, + int h); void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -37,19 +37,18 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); + const int16_t *filter_y, int y_step_q4, int w, + int h); -void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter, - int w, int h); +void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h); void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); + const int16_t *filter_y, int y_step_q4, int w, + int h); #endif // #if HAVE_DSPR2 #ifdef __cplusplus diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c index e98a0399ba3c4fd95bcf29f9a48f5cc160b397fb..402d7ed99793b2831389b08c4c377fe37887a1de 100644 --- a/vpx_dsp/mips/deblock_msa.c +++ b/vpx_dsp/mips/deblock_msa.c @@ -13,133 +13,132 @@ 
extern const int16_t vpx_rv[]; -#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, \ - out4, out5, out6, out7, \ - out8, out9, out10, out11, \ - out12, out13, out14, out15) \ -{ \ - v8i16 temp0, temp1, temp2, temp3, temp4; \ - v8i16 temp5, temp6, temp7, temp8, temp9; \ - \ - ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ - temp0, temp1, temp2, temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ - ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ - temp0, temp1, temp2, temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out8, out10); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out12, out14); \ - out0 = (v16u8)temp6; \ - out2 = (v16u8)temp7; \ - out4 = (v16u8)temp8; \ - out6 = (v16u8)temp9; \ - out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ - out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ - out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ - out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ - out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ - out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ - out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ -} +#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7, \ + out8, out9, out10, out11, out12, out13, out14, \ + out15) \ + { \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ + ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out8, out10); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out12, out14); \ + out0 = (v16u8)temp6; \ + out2 = (v16u8)temp7; \ + out4 = (v16u8)temp8; \ + out6 = (v16u8)temp9; \ + out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ + out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ + out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ + out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ + out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ + } -#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, \ - below1_in, below2_in, ref, out) \ -{ \ - v16u8 temp0, temp1; \ - \ - temp1 = __msa_aver_u_b(above2_in, above1_in); \ - temp0 = __msa_aver_u_b(below2_in, below1_in); \ - temp1 = __msa_aver_u_b(temp1, temp0); \ - out = __msa_aver_u_b(src_in, temp1); \ - temp0 = __msa_asub_u_b(src_in, above2_in); \ - temp1 = __msa_asub_u_b(src_in, above1_in); \ - temp0 = (temp0 < ref); \ - temp1 = (temp1 < ref); \ - temp0 = temp0 & temp1; \ - temp1 = __msa_asub_u_b(src_in, below1_in); \ - temp1 = (temp1 < ref); \ - temp0 = temp0 & temp1; \ - temp1 = 
__msa_asub_u_b(src_in, below2_in); \ - temp1 = (temp1 < ref); \ - temp0 = temp0 & temp1; \ - out = __msa_bmz_v(out, src_in, temp0); \ -} +#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \ + ref, out) \ + { \ + v16u8 temp0, temp1; \ + \ + temp1 = __msa_aver_u_b(above2_in, above1_in); \ + temp0 = __msa_aver_u_b(below2_in, below1_in); \ + temp1 = __msa_aver_u_b(temp1, temp0); \ + out = __msa_aver_u_b(src_in, temp1); \ + temp0 = __msa_asub_u_b(src_in, above2_in); \ + temp1 = __msa_asub_u_b(src_in, above1_in); \ + temp0 = (temp0 < ref); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + temp1 = __msa_asub_u_b(src_in, below1_in); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + temp1 = __msa_asub_u_b(src_in, below2_in); \ + temp1 = (temp1 < ref); \ + temp0 = temp0 & temp1; \ + out = __msa_bmz_v(out, src_in, temp0); \ + } -#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, in9, in10, in11, in12, in13, in14, in15) \ -{ \ - v8i16 temp0, temp1, temp2, temp3, temp4; \ - v8i16 temp5, temp6, temp7, temp8, temp9; \ - \ - ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ - ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ - ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ - ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ - ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ - ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ - ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ - ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ - ILVRL_H2_SH(temp5, temp4, temp6, temp7); \ - ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \ - ILVRL_H2_SH(temp5, temp4, temp8, temp9); \ - ILVRL_W2_SH(temp8, temp6, temp4, temp5); \ - ILVRL_W2_SH(temp9, temp7, temp6, temp7); \ - ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \ - ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \ - in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \ - in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \ - ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \ - ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \ - in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \ - in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \ - ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, \ - temp2, temp3, temp4, temp5); \ - ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, \ - temp6, temp7, temp8, temp9); \ - ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \ - in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \ - in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \ - ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \ - in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \ - in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \ -} +#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ + in10, in11, in12, in13, in14, in15) \ + { \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ + ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ + ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ + ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ + ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ + ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \ + ILVRL_H2_SH(temp5, temp4, temp6, temp7); \ + ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \ + ILVRL_H2_SH(temp5, temp4, temp8, temp9); \ + ILVRL_W2_SH(temp8, temp6, temp4, temp5); \ + ILVRL_W2_SH(temp9, temp7, temp6, temp7); \ + 
ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \ + ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \ + in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \ + in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \ + ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \ + in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \ + in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \ + ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, temp2, temp3, \ + temp4, temp5); \ + ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, temp6, \ + temp7, temp8, temp9); \ + ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \ + in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \ + in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \ + ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \ + in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \ + in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \ + } -#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, \ - in6, in7, in8, in9, in10, in11) \ -{ \ - v8i16 temp0, temp1, temp2, temp3; \ - v8i16 temp4, temp5, temp6, temp7; \ - \ - ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ - ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ - ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ - ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ - ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ - ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ - ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \ - temp4 = __msa_ilvr_h(temp5, temp4); \ - ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \ - temp5 = __msa_ilvr_h(temp7, temp6); \ - ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ - in0 = (v16u8)temp0; \ - in2 = (v16u8)temp1; \ - in4 = (v16u8)temp2; \ - in6 = (v16u8)temp3; \ - in8 = (v16u8)temp6; \ - in10 = (v16u8)temp7; \ - in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \ - in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \ - in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \ - in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \ - in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \ - in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \ -} +#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, \ + in9, in10, in11) \ + { \ + v8i16 temp0, temp1, temp2, temp3; \ + v8i16 temp4, temp5, temp6, temp7; \ + \ + ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp2, temp3); \ + ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \ + ILVRL_H2_SH(temp1, temp0, temp4, temp5); \ + ILVRL_W2_SH(temp4, temp2, temp0, temp1); \ + ILVRL_W2_SH(temp5, temp3, temp2, temp3); \ + ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \ + temp4 = __msa_ilvr_h(temp5, temp4); \ + ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \ + temp5 = __msa_ilvr_h(temp7, temp6); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + in0 = (v16u8)temp0; \ + in2 = (v16u8)temp1; \ + in4 = (v16u8)temp2; \ + in6 = (v16u8)temp3; \ + in8 = (v16u8)temp6; \ + in10 = (v16u8)temp7; \ + in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \ + in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \ + in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \ + in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \ + in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \ + in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \ + } static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, int32_t src_stride, @@ -203,16 +202,16 @@ static void 
postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); above1 = LD_UB(p_src + 9 * src_stride); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); - out0 = __msa_copy_u_d((v2i64) inter0, 0); - out1 = __msa_copy_u_d((v2i64) inter1, 0); - out2 = __msa_copy_u_d((v2i64) inter2, 0); - out3 = __msa_copy_u_d((v2i64) inter3, 0); + out0 = __msa_copy_u_d((v2i64)inter0, 0); + out1 = __msa_copy_u_d((v2i64)inter1, 0); + out2 = __msa_copy_u_d((v2i64)inter2, 0); + out3 = __msa_copy_u_d((v2i64)inter3, 0); SD4(out0, out1, out2, out3, p_dst, dst_stride); - out0 = __msa_copy_u_d((v2i64) inter4, 0); - out1 = __msa_copy_u_d((v2i64) inter5, 0); - out2 = __msa_copy_u_d((v2i64) inter6, 0); - out3 = __msa_copy_u_d((v2i64) inter7, 0); + out0 = __msa_copy_u_d((v2i64)inter4, 0); + out1 = __msa_copy_u_d((v2i64)inter5, 0); + out2 = __msa_copy_u_d((v2i64)inter6, 0); + out3 = __msa_copy_u_d((v2i64)inter7, 0); SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride); } @@ -236,36 +235,36 @@ static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, src = inter2; below1 = inter3; below2 = inter4; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2); above2 = inter5; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3); above1 = inter6; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4); src = inter7; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3); VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5); below1 = inter8; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4); VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6); below2 = inter9; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7); if (col == (cols / 8 - 1)) { above2 = inter9; } else { above2 = inter10; } - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8); if (col == (cols / 8 - 1)) { above1 = inter9; } else { above1 = inter11; } - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9); TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8, inter9, inter2, inter3, inter4, inter5, inter6, inter7, @@ -371,36 +370,36 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, src = inter2; below1 = inter3; below2 = inter4; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2); above2 = inter5; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3); above1 = inter6; - ref_temp 
= (v16u8) __msa_splati_b((v16i8) ref, 2); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4); src = inter7; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3); VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5); below1 = inter8; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4); VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6); below2 = inter9; - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5); VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7); if (col == (cols / 8 - 1)) { above2 = inter9; } else { above2 = inter10; } - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6); VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8); if (col == (cols / 8 - 1)) { above1 = inter9; } else { above1 = inter11; } - ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7); + ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7); VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9); VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8, inter9, inter2, inter3, inter4, inter5, @@ -452,8 +451,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, int32_t row, col, cnt; uint8_t *src_dup = src_ptr; v16u8 src0, src, tmp_orig; - v16u8 tmp = {0}; - v16i8 zero = {0}; + v16u8 tmp = { 0 }; + v16i8 zero = { 0 }; v8u16 sum_h, src_r_h, src_l_h; v4u32 src_r_w, src_l_w; v4i32 flimit_vec; @@ -462,13 +461,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, for (row = rows; row--;) { int32_t sum_sq = 0; int32_t sum = 0; - src0 = (v16u8) __msa_fill_b(src_dup[0]); + src0 = (v16u8)__msa_fill_b(src_dup[0]); ST8x1_UB(src0, (src_dup - 8)); - src0 = (v16u8) __msa_fill_b(src_dup[cols - 1]); + src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]); ST_UB(src0, src_dup + cols); src_dup[cols + 16] = src_dup[cols - 1]; - tmp_orig = (v16u8) __msa_ldi_b(0); + tmp_orig = (v16u8)__msa_ldi_b(0); tmp_orig[15] = tmp[15]; src = LD_UB(src_dup - 8); src[15] = 0; @@ -508,9 +507,9 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, sum = sum_l[7]; src = LD_UB(src_dup + 16 * col); ILVRL_B2_UH(zero, src, src_r_h, src_l_h); - src7 = (v16u8)((const8 + sum_r + (v8i16) src_r_h) >> 4); - src8 = (v16u8)((const8 + sum_l + (v8i16) src_l_h) >> 4); - tmp = (v16u8) __msa_pckev_b((v16i8) src8, (v16i8) src7); + src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4); + src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4); + tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7); HADD_UB2_UH(src_r, src_l, add_r, add_l); UNPCK_SH_SW(sub_r, sub0, sub1); @@ -552,13 +551,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, total2 = (total2 < flimit_vec); total3 = (total3 < flimit_vec); PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); - mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0); - tmp = __msa_bmz_v(tmp, src, (v16u8) mask); + mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); + tmp = __msa_bmz_v(tmp, src, (v16u8)mask); if (col == 0) { uint64_t src_d; - src_d = __msa_copy_u_d((v2i64) tmp_orig, 1); + src_d = __msa_copy_u_d((v2i64)tmp_orig, 1); SD(src_d, (src_dup - 8)); } @@ -588,15 +587,15 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, for (col = 0; col 
< (cols >> 4); ++col) { uint8_t *dst_tmp = &dst_ptr[col << 4]; v16u8 dst; - v16i8 zero = {0}; + v16i8 zero = { 0 }; v16u8 tmp[16]; v8i16 mult0, mult1, rv2_0, rv2_1; - v8i16 sum0_h = {0}; - v8i16 sum1_h = {0}; - v4i32 mul0 = {0}; - v4i32 mul1 = {0}; - v4i32 mul2 = {0}; - v4i32 mul3 = {0}; + v8i16 sum0_h = { 0 }; + v8i16 sum1_h = { 0 }; + v4i32 mul0 = { 0 }; + v4i32 mul1 = { 0 }; + v4i32 mul2 = { 0 }; + v4i32 mul3 = { 0 }; v4i32 sum0_w, sum1_w, sum2_w, sum3_w; v4i32 add0, add1, add2, add3; const int16_t *rv2[16]; @@ -618,10 +617,10 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, dst = LD_UB(dst_tmp + (cnt * pitch)); UNPCK_UB_SH(dst, dst_r_h, dst_l_h); MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1); - mul0 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult0); - mul1 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult0); - mul2 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult1); - mul3 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult1); + mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0); + mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0); + mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1); + mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1); ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h); } @@ -652,7 +651,7 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h); dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4); dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4); - tmp[row & 15] = (v16u8) __msa_pckev_b((v16i8) dst8, (v16i8) dst7); + tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7); UNPCK_SH_SW(sum0_h, sum0_w, sum1_w); UNPCK_SH_SW(sum1_h, sum2_w, sum3_w); @@ -669,8 +668,8 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, total2 = (total2 < flimit_vec); total3 = (total3 < flimit_vec); PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); - mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0); - tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8) mask); + mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); + tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask); if (row >= 8) { ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch)); diff --git a/vpx_dsp/mips/fwd_dct32x32_msa.c b/vpx_dsp/mips/fwd_dct32x32_msa.c index f29c14b3d0e31154add3b6786d506853513b7e46..e41a904808e1826a8d6ee6a98ee2fef2cce8491c 100644 --- a/vpx_dsp/mips/fwd_dct32x32_msa.c +++ b/vpx_dsp/mips/fwd_dct32x32_msa.c @@ -27,10 +27,10 @@ static void fdct8x32_1d_column_load_butterfly(const int16_t *input, SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, - step0, step1, step2, step3, in4, in5, in6, in7); - BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); ST_SH4(step0, step1, step2, step3, temp_buff, 8); ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); @@ -45,10 +45,10 @@ static void fdct8x32_1d_column_load_butterfly(const int16_t *input, SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); 
- BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, - step0, step1, step2, step3, in4, in5, in6, in7); - BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); @@ -64,12 +64,12 @@ static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { /* fdct even */ LD_SH4(input, 8, in0, in1, in2, in3); LD_SH4(input + 96, 8, in12, in13, in14, in15); - BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, in12, in13, in14, in15); + BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, + vec3, in12, in13, in14, in15); LD_SH4(input + 32, 8, in4, in5, in6, in7); LD_SH4(input + 64, 8, in8, in9, in10, in11); - BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, - vec4, vec5, vec6, vec7, in8, in9, in10, in11); + BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, + in8, in9, in10, in11); /* Stage 3 */ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); @@ -258,28 +258,26 @@ static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, - in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - step0, step1, step2, step3, step4, step5, step6, step7, - in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); /* 2nd set */ LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, - in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - step0, step1, step2, step3, step4, step5, step6, step7, - in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, 
in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, (output + 8 * 8), 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); @@ -299,10 +297,9 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, - in8, in9, in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); @@ -315,19 +312,19 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, UNPCK_SH_SW(vec5, vec5_l, vec5_r); UNPCK_SH_SW(vec6, vec6_l, vec6_r); UNPCK_SH_SW(vec7, vec7_l, vec7_r); - ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, - tmp0_w, tmp1_w, tmp2_w, tmp3_w); + ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, + tmp1_w, tmp2_w, tmp3_w); BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); - ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, - vec0_r, vec1_r, vec2_r, vec3_r); + ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, + vec1_r, vec2_r, vec3_r); tmp3_w = vec0_r + vec3_r; vec0_r = vec0_r - vec3_r; vec3_r = vec1_r + vec2_r; vec1_r = vec1_r - vec2_r; - DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, - cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r); + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, + vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); @@ -335,8 +332,8 @@ static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); ST_SH2(vec5, vec4, out, 8); - DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, - cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r); + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, + vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); @@ -401,10 +398,9 @@ static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, - in8, in9, in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); /* Stage 3 */ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); @@ -610,8 +606,8 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { in3 = LD_SH(temp + 192); in5 = LD_SH(temp + 216); 
- TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* 2nd set */ in0_1 = LD_SH(temp + 16); @@ -637,10 +633,10 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { in6 = LD_SH(temp + 104); in7 = LD_SH(temp + 144); - ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - output + 8, 32); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8, + 32); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); /* 4th set */ @@ -655,12 +651,11 @@ static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); - ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - output + 24, 32); + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24, + 32); } -static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, - int16_t *output) { +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { fdct8x32_1d_row_load_butterfly(temp, temp_buf); fdct8x32_1d_row_even(temp_buf, temp_buf); fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); @@ -706,10 +701,9 @@ static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15, - vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, - in8, in9, in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); FDCT_POSTPROC_2V_NEG_H(vec0, vec1); FDCT_POSTPROC_2V_NEG_H(vec2, vec3); FDCT_POSTPROC_2V_NEG_H(vec4, vec5); diff --git a/vpx_dsp/mips/fwd_txfm_msa.c b/vpx_dsp/mips/fwd_txfm_msa.c index 5571d220e42843006c171760732ac75340e098c1..cb3d6282d5a27b249db307517973c3966dab3ee6 100644 --- a/vpx_dsp/mips/fwd_txfm_msa.c +++ b/vpx_dsp/mips/fwd_txfm_msa.c @@ -18,24 +18,24 @@ void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; - v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, - -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; - v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, - cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; - v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, - 0, 0, 0, 0 }; - - LD_SH16(input, src_stride, - in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15); + v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; + v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; + v8i16 coeff2 = { + -cospi_2_64, 
-cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 + }; + + LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, + in10, in11, in12, in13, in14, in15); SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in8, in9, in10, in11, 2); SLLI_4V(in12, in13, in14, in15, 2); ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); - FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, - tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); @@ -137,10 +137,10 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) { LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, - in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); @@ -150,19 +150,19 @@ void fdct16x8_1d_row(int16_t *input, int16_t *output) { SRA_4V(in8, in9, in10, in11, 2); SRA_4V(in12, in13, in14, in15, 2); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, - tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15); + in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, + tmp7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); - FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, - tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); - FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, - tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); - TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, - tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); } @@ -203,14 +203,14 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output, LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7); SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in4, in5, in6, in7, 
2); - VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); } diff --git a/vpx_dsp/mips/fwd_txfm_msa.h b/vpx_dsp/mips/fwd_txfm_msa.h index d7bb316d5be812eb01b8441b22f5ffde6968678e..6458dec6d20f3ebc11e78675b41d463ca0e8a5c2 100644 --- a/vpx_dsp/mips/fwd_txfm_msa.h +++ b/vpx_dsp/mips/fwd_txfm_msa.h @@ -14,358 +14,365 @@ #include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/txfm_common.h" -#define LD_HADD(psrc, stride) ({ \ - v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ - v4i32 vec_w_m; \ - \ - LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ - ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ - LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ - ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, \ - in4_m, in6_m, in0_m, in4_m); \ - in0_m += in4_m; \ - \ - vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ - HADD_SW_S32(vec_w_m); \ -}) +#define LD_HADD(psrc, stride) \ + ({ \ + v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ + v4i32 vec_w_m; \ + \ + LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ + ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ + LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ + ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \ + in0_m, in4_m); \ + in0_m += in4_m; \ + \ + vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ + HADD_SW_S32(vec_w_m); \ + }) -#define VPX_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ - v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ - cospi_24_64, -cospi_8_64, 0, 0, 0 }; \ - \ - BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ - ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ - \ - SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ - cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ - vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ - \ - vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ - cnst2_m = __msa_splati_h(coeff_m, 2); \ - cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ - vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ - \ - SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ - PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, \ - vec7_m, vec7_m, out0, out2, out1, out3); \ -} +#define VPX_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ + v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ + 
v8i16 coeff_m = { \ + cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \ + }; \ + \ + BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ + cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ + vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ + cnst2_m = __msa_splati_h(coeff_m, 2); \ + cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ + vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \ + vec7_m, out0, out2, out1, out3); \ + } -#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) { \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - \ - SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ - SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ - AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, \ - in0, in1, in2, in3); \ - AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, \ - in4, in5, in6, in7); \ -} +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + { \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ + SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ + AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \ + in2, in3); \ + AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \ + in6, in7); \ + } -#define VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ - v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ - cospi_24_64, cospi_4_64, cospi_28_64, \ - cospi_12_64, cospi_20_64 }; \ - \ - /* FDCT stage1 */ \ - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ - s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ - BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ - ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ - ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x1_m, x0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ - x2_m = -x2_m; \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - x2_m = __msa_splati_h(coeff_m, 2); \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - /* stage2 */ \ - ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ - \ - s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ - \ - /* stage3 */ \ - BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ - \ - /* stage4 */ \ - ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ - ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ - \ - SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x0_m, x1_m); \ - out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ - 
x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ - \ - x1_m = __msa_splati_h(coeff_m, 5); \ - x0_m = -x0_m; \ - x0_m = __msa_ilvev_h(x1_m, x0_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ - \ - x2_m = __msa_splati_h(coeff_m, 6); \ - x3_m = -x3_m; \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ -} +#define VPX_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } -#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ - v8i16 x0_m, x1_m, x2_m, x3_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ - cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ +#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v8i16 x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ \ - /* FDCT stage1 */ \ - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ - s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ - BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ - ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ - ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x1_m, x0_m); \ 
- out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ \ - SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ - x2_m = -x2_m; \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ \ - out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - x2_m = __msa_splati_h(coeff_m, 2); \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ \ - /* stage2 */ \ - ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ \ - s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ \ - /* stage3 */ \ - BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ \ - /* stage4 */ \ - ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ - ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ \ - SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x0_m, x1_m); \ - out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ \ - SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ \ - x1_m = __msa_splati_h(coeff_m, 5); \ - x0_m = -x0_m; \ - x0_m = __msa_ilvev_h(x1_m, x0_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ \ - x2_m = __msa_splati_h(coeff_m, 6); \ - x3_m = -x3_m; \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ -} + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } -#define FDCT8x16_ODD(input0, input1, input2, input3, \ - input4, input5, input6, input7, \ - out1, out3, out5, out7, \ - out9, out11, out13, out15) { \ - v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ - v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ - v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ - v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ - v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \ - cospi_24_64, -cospi_8_64, 
-cospi_24_64, \ - cospi_12_64, cospi_20_64 }; \ - v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, \ - cospi_18_64, cospi_10_64, cospi_22_64, \ - cospi_6_64, cospi_26_64 }; \ - v8i16 coeff2_m = { -cospi_2_64, -cospi_10_64, -cospi_18_64, \ - -cospi_26_64, 0, 0, 0, 0 }; \ - \ - /* stp 1 */ \ - ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ - ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ - \ - cnst4_m = __msa_splati_h(coeff_m, 0); \ - stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ - \ - cnst5_m = __msa_splati_h(coeff_m, 1); \ - cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ - stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ - stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ - stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ - \ - /* stp2 */ \ - BUTTERFLY_4(input0, input1, stp22_m, stp23_m, \ - stp30_m, stp31_m, stp32_m, stp33_m); \ - BUTTERFLY_4(input7, input6, stp25_m, stp24_m, \ - stp37_m, stp36_m, stp35_m, stp34_m); \ - \ - ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ - ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ - \ - SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff_m, 4); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ - \ - SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ - \ - cnst0_m = __msa_splati_h(coeff_m, 3); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ - \ - /* stp4 */ \ - BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, \ - vec6_m, vec2_m, vec4_m, vec5_m); \ - BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, \ - stp21_m, stp23_m, stp24_m, stp31_m); \ - \ - ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - \ - out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff2_m, 0); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - \ - cnst1_m = __msa_splati_h(coeff2_m, 2); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff2_m, 1); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - \ - cnst1_m = __msa_splati_h(coeff2_m, 3); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ -} +#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ + 
input7, out1, out3, out5, out7, out9, out11, out13, \ + out15) \ + { \ + v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ + v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \ + v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \ + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \ + v8i16 coeff2_m = { \ + -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \ + }; \ + \ + /* stp 1 */ \ + ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ + ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __msa_splati_h(coeff_m, 0); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ + \ + cnst5_m = __msa_splati_h(coeff_m, 1); \ + cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ + stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ + stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ + \ + /* stp2 */ \ + BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \ + stp33_m); \ + BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \ + stp34_m); \ + \ + ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ + ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 4); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 3); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + /* stp4 */ \ + BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \ + vec5_m); \ + BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \ + stp31_m); \ + \ + ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + \ + out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 0); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 2); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 1); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp24_m, 
stp31_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 3); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + } -#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) { \ - v8i16 tp0_m, tp1_m; \ - v8i16 one_m = __msa_ldi_h(1); \ - \ - tp0_m = __msa_clti_s_h(vec0, 0); \ - tp1_m = __msa_clti_s_h(vec1, 0); \ - vec0 += 1; \ - vec1 += 1; \ - tp0_m = one_m & tp0_m; \ - tp1_m = one_m & tp1_m; \ - vec0 += tp0_m; \ - vec1 += tp1_m; \ - vec0 >>= 2; \ - vec1 >>= 2; \ -} +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one_m = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clti_s_h(vec0, 0); \ + tp1_m = __msa_clti_s_h(vec1, 0); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one_m & tp0_m; \ + tp1_m = one_m & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } -#define FDCT32_POSTPROC_NEG_W(vec) { \ - v4i32 temp_m; \ - v4i32 one_m = __msa_ldi_w(1); \ - \ - temp_m = __msa_clti_s_w(vec, 0); \ - vec += 1; \ - temp_m = one_m & temp_m; \ - vec += temp_m; \ - vec >>= 2; \ -} +#define FDCT32_POSTPROC_NEG_W(vec) \ + { \ + v4i32 temp_m; \ + v4i32 one_m = __msa_ldi_w(1); \ + \ + temp_m = __msa_clti_s_w(vec, 0); \ + vec += 1; \ + temp_m = one_m & temp_m; \ + vec += temp_m; \ + vec >>= 2; \ + } -#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) { \ - v8i16 tp0_m, tp1_m; \ - v8i16 one = __msa_ldi_h(1); \ +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one = __msa_ldi_h(1); \ \ - tp0_m = __msa_clei_s_h(vec0, 0); \ - tp1_m = __msa_clei_s_h(vec1, 0); \ - tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ - tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ - vec0 += 1; \ - vec1 += 1; \ - tp0_m = one & tp0_m; \ - tp1_m = one & tp1_m; \ - vec0 += tp0_m; \ - vec1 += tp1_m; \ - vec0 >>= 2; \ - vec1 >>= 2; \ -} + tp0_m = __msa_clei_s_h(vec0, 0); \ + tp1_m = __msa_clei_s_h(vec1, 0); \ + tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ + tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one & tp0_m; \ + tp1_m = one & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } -#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, \ - reg1_right, const0, const1, \ - out0, out1, out2, out3) { \ - v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ - v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ - v4i32 k0_m = __msa_fill_w((int32_t) const0); \ - \ - s0_m = __msa_fill_w((int32_t) const1); \ - k0_m = __msa_ilvev_w(s0_m, k0_m); \ - \ - ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ - ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ - ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ - ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ - \ - DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ - DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ - tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ - tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ - tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ - tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ - out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ - out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ - \ - DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ - DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ - tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ - tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ - 
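The two post-processing macros above (FDCT_POSTPROC_2V_NEG_H and FDCT32_POSTPROC_2V_POS_H) reduce, per 16-bit lane, to a small rounding step. A minimal scalar sketch of that per-lane arithmetic, under my reading of the MSA compare/shift intrinsics (the helper names below are illustrative, not libvpx symbols):

#include <stdint.h>

/* Illustrative only -- not part of the patch or of libvpx. */
static int16_t fdct_postproc_neg(int16_t x) {
  /* FDCT_POSTPROC_2V_NEG_H: add 1, add one more for negative lanes,
   * then arithmetic shift right by 2. */
  return (int16_t)((x + 1 + (x < 0)) >> 2);
}

static int16_t fdct_postproc_pos(int16_t x) {
  /* FDCT32_POSTPROC_2V_POS_H: same, but the extra 1 goes to positive lanes. */
  return (int16_t)((x + 1 + (x > 0)) >> 2);
}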
tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ - tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ - out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ - out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ -} +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ + const0, const1, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ + v4i32 k0_m = __msa_fill_w((int32_t)const0); \ + \ + s0_m = __msa_fill_w((int32_t)const1); \ + k0_m = __msa_ilvev_w(s0_m, k0_m); \ + \ + ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ + ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ + ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ + ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ + \ + DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + \ + DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + } void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride); diff --git a/vpx_dsp/mips/idct16x16_msa.c b/vpx_dsp/mips/idct16x16_msa.c index 6d403efa7f6f71050d01104b2fda0cfaab463ae9..1cbeb35ba5efeccc909e95fa6d5a33a2e6275f5a 100644 --- a/vpx_dsp/mips/idct16x16_msa.c +++ b/vpx_dsp/mips/idct16x16_msa.c @@ -20,10 +20,10 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { input += 8; LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); - TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, - reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, - reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1, + reg2, reg3, reg4, reg5, reg6, reg7); + TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8, + reg9, reg10, reg11, reg12, reg13, reg14, reg15); DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); @@ -93,13 +93,13 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { reg3 = tmp7; /* transpose block */ - TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, - reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14); + TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0, + reg2, reg4, reg6, reg8, reg10, reg12, reg14); ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); /* transpose block */ - TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, - reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15); + TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3, + reg13, reg11, reg5, reg7, reg9, reg1, reg15); 
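DOTP_CONST_PAIR_W above, like the narrower DOTP_CONST_PAIR used in the idct16 row hunk that follows, packs the usual two-coefficient DCT rotation into MSA dot products followed by a rounded shift. Assuming the usual libvpx convention (DCT_CONST_BITS defined in vpx_dsp/txfm_common.h) and my reading of the interleave order, the per-lane computation is roughly:

#include <stdint.h>

#ifndef DCT_CONST_BITS
#define DCT_CONST_BITS 14 /* assumption: matches vpx_dsp/txfm_common.h */
#endif

/* Hypothetical helper, not a libvpx function: rotate (a, b) by the
 * coefficient pair (c0, c1) with dct_const_round_shift()-style rounding. */
static void dct_rotate_pair(int32_t a, int32_t b, int32_t c0, int32_t c1,
                            int32_t *out0, int32_t *out1) {
  const int64_t rnd = (int64_t)1 << (DCT_CONST_BITS - 1);
  *out0 = (int32_t)(((int64_t)a * c0 - (int64_t)b * c1 + rnd) >> DCT_CONST_BITS);
  *out1 = (int32_t)(((int64_t)a * c1 + (int64_t)b * c0 + rnd) >> DCT_CONST_BITS);
}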
ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); } @@ -233,7 +233,7 @@ void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, /* short case just considers top 4 rows as valid output */ out += 4 * 16; for (i = 12; i--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[out]) \n\t" "sw $zero, 4(%[out]) \n\t" "sw $zero, 8(%[out]) \n\t" @@ -244,8 +244,7 @@ void vpx_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, "sw $zero, 28(%[out]) \n\t" : - : [out] "r" (out) - ); + : [out] "r"(out)); out += 16; } @@ -283,8 +282,8 @@ void vpx_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); CLIP_SH4_0_255(res0, res1, res2, res3); CLIP_SH4_0_255(res4, res5, res6, res7); - PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, - tmp0, tmp1, tmp2, tmp3); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); dst += (4 * dst_stride); } @@ -295,29 +294,28 @@ void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; /* load input data */ - LD_SH16(input, 8, - l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, l7, l15); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, - l0, l1, l2, l3, l4, l5, l6, l7); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, - l8, l9, l10, l11, l12, l13, l14, l15); + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6, + l7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11, + l12, l13, l14, l15); /* ADST in horizontal */ - VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, - l8, l9, l10, l11, l12, l13, l14, l15, - r0, r1, r2, r3, r4, r5, r6, r7, - r8, r9, r10, r11, r12, r13, r14, r15); + VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, + l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, + r12, r13, r14, r15); l1 = -r8; l3 = -r4; l13 = -r13; l15 = -r1; - TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, - l0, l1, l2, l3, l4, l5, l6, l7); + TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5, + l6, l7); ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); - TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, - l8, l9, l10, l11, l12, l13, l14, l15); + TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12, + l13, l14, l15); ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); } diff --git a/vpx_dsp/mips/idct32x32_msa.c b/vpx_dsp/mips/idct32x32_msa.c index de47597a8d54e1ffed2ccb99c81328210549c2f2..ed5cef18a9c0426dc008d3946e27adb8d7ce44b0 100644 --- a/vpx_dsp/mips/idct32x32_msa.c +++ b/vpx_dsp/mips/idct32x32_msa.c @@ -17,10 +17,10 @@ static void idct32x8_row_transpose_store(const int16_t *input, /* 1st & 2nd 8x8 */ LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, 
(tmp_buf), 8); ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); @@ -28,10 +28,10 @@ static void idct32x8_row_transpose_store(const int16_t *input, /* 3rd & 4th 8x8 */ LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); @@ -186,8 +186,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf, DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); /* 4 Stores */ - SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, - vec0, vec1, vec2, vec3); + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); @@ -198,8 +197,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf, ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); /* 4 Stores */ - ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, - vec1, vec2, vec0, vec3); + ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3); BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); ST_SH(reg0, (tmp_odd_buf + 13 * 8)); ST_SH(reg1, (tmp_odd_buf + 14 * 8)); @@ -213,8 +211,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf, LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, - loc0, loc1, loc2, loc3); + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); SUB2(reg0, reg4, reg1, reg5, vec0, vec1); @@ -228,8 +225,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf, LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, - loc0, loc1, loc2, loc3); + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); SUB2(reg0, reg4, reg3, reg7, vec0, vec1); @@ -242,8 +238,7 @@ static void idct32x8_row_odd_process_store(int16_t *tmp_buf, static void idct_butterfly_transpose_store(int16_t *tmp_buf, int16_t *tmp_eve_buf, - int16_t *tmp_odd_buf, - int16_t *dst) { + int16_t *tmp_odd_buf, int16_t *dst) { v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; @@ -317,26 +312,26 @@ static void idct_butterfly_transpose_store(int16_t *tmp_buf, /* Transpose : 16 vectors */ /* 1st & 2nd 8x8 */ - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); ST_SH4(m0, n0, m1, n1, (dst + 0), 32); ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); 
ST_SH4(m4, n4, m5, n5, (dst + 8), 32); ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); /* 3rd & 4th 8x8 */ LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, - m0, n0, m1, n1, m2, n2, m3, n3); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); ST_SH4(m0, n0, m1, n1, (dst + 16), 32); ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, - m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); ST_SH4(m4, n4, m5, n5, (dst + 24), 32); ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); } @@ -349,8 +344,8 @@ static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) { idct32x8_row_transpose_store(input, &tmp_buf[0]); idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); - idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], - &tmp_odd_buf[0], output); + idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], + output); } static void idct8x32_column_even_process_store(int16_t *tmp_buf, @@ -541,8 +536,7 @@ static void idct8x32_column_odd_process_store(int16_t *tmp_buf, } static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, - int16_t *tmp_odd_buf, - uint8_t *dst, + int16_t *tmp_odd_buf, uint8_t *dst, int32_t dst_stride) { v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; @@ -563,8 +557,8 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); SRARI_H4_SH(m0, m2, m4, m6, 6); - VPX_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), - m0, m2, m4, m6); + VPX_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4, + m6); /* Load 8 & Store 8 */ vec0 = LD_SH(tmp_odd_buf + 4 * 8); @@ -578,13 +572,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); SRARI_H4_SH(m1, m3, m5, m7, 6); - VPX_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), - m1, m3, m5, m7); + VPX_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7); SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); SRARI_H4_SH(m1, m3, m5, m7, 6); - VPX_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), - m1, m3, m5, m7); + VPX_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5, + m7); /* Load 8 & Store 8 */ vec0 = LD_SH(tmp_odd_buf + 2 * 8); @@ -598,13 +591,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); SRARI_H4_SH(n0, n2, n4, n6, 6); - VPX_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), - n0, n2, n4, n6); + VPX_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6); SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); SRARI_H4_SH(n0, n2, n4, n6, 6); - VPX_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), - n0, n2, n4, n6); + VPX_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4, + n6); /* Load 8 & Store 8 */ vec0 = LD_SH(tmp_odd_buf + 5 * 8); @@ -618,13 +610,12 @@ static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, ADD4(loc0, vec3, loc1, 
vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); SRARI_H4_SH(n1, n3, n5, n7, 6); - VPX_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), - n1, n3, n5, n7); + VPX_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7); SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); SRARI_H4_SH(n1, n3, n5, n7, 6); - VPX_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), - n1, n3, n5, n7); + VPX_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5, + n7); } static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, @@ -634,8 +625,8 @@ static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); - idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], - dst, dst_stride); + idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, + dst_stride); } void vpx_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, @@ -665,7 +656,7 @@ void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, int16_t *out_ptr = out_arr; for (i = 32; i--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[out_ptr]) \n\t" "sw $zero, 4(%[out_ptr]) \n\t" "sw $zero, 8(%[out_ptr]) \n\t" @@ -684,8 +675,7 @@ void vpx_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, "sw $zero, 60(%[out_ptr]) \n\t" : - : [out_ptr] "r" (out_ptr) - ); + : [out_ptr] "r"(out_ptr)); out_ptr += 32; } @@ -728,8 +718,8 @@ void vpx_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); CLIP_SH4_0_255(res0, res1, res2, res3); CLIP_SH4_0_255(res4, res5, res6, res7); - PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, - tmp0, tmp1, tmp2, tmp3); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); ST_UB2(tmp0, tmp1, dst, 16); dst += dst_stride; diff --git a/vpx_dsp/mips/idct4x4_msa.c b/vpx_dsp/mips/idct4x4_msa.c index 04064f87dfef555d140eba00b3f1e2aa06b379b7..50e824850d0c1f9f44ce7ba4a1b353a432231671 100644 --- a/vpx_dsp/mips/idct4x4_msa.c +++ b/vpx_dsp/mips/idct4x4_msa.c @@ -42,8 +42,8 @@ void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, in0_r -= in3_r; in2_r += in1_r; - PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, - in0, in1, in2, in3); + PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1, + in2, in3); ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride); } diff --git a/vpx_dsp/mips/idct8x8_msa.c b/vpx_dsp/mips/idct8x8_msa.c index 6a24935fffaec50d61f0c90cfdc99c3b622fb7bd..c06330b027f93b8df30f99e4b87df8a5d21eafde 100644 --- a/vpx_dsp/mips/idct8x8_msa.c +++ b/vpx_dsp/mips/idct8x8_msa.c @@ -18,17 +18,17 @@ void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); /* rows transform */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* 1D idct8x8 */ - VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* columns transform */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, 
in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* 1D idct8x8 */ - VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* final rounding (add 2^4, divide by 2^5) and shift */ SRARI_H4_SH(in0, in1, in2, in3, 5); SRARI_H4_SH(in4, in5, in6, in7, 5); @@ -82,12 +82,12 @@ void vpx_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst, PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); /* stage4 */ - BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, - in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6, + in7); + TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); /* final rounding (add 2^4, divide by 2^5) and shift */ SRARI_H4_SH(in0, in1, in2, in3, 5); diff --git a/vpx_dsp/mips/intrapred16_dspr2.c b/vpx_dsp/mips/intrapred16_dspr2.c index 11444c718e7b4ccc8687e7c54bd42b41173e8dac..3e29d0ac39f37fabee210ccdb953332d78fe6e6e 100644 --- a/vpx_dsp/mips/intrapred16_dspr2.c +++ b/vpx_dsp/mips/intrapred16_dspr2.c @@ -13,10 +13,10 @@ #if HAVE_DSPR2 void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; - __asm__ __volatile__ ( + __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" "lb %[tmp2], 1(%[left]) \n\t" "lb %[tmp3], 2(%[left]) \n\t" @@ -146,26 +146,23 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, "sw %[tmp16], 8(%[dst]) \n\t" "sw %[tmp16], 12(%[dst]) \n\t" - : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), - [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), - [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), - [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8), - [tmp9] "=&r" (tmp9), [tmp10] "=&r" (tmp10), - [tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12), - [tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14), - [tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16) - : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride) - ); + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9), + [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12), + [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15), + [tmp16] "=&r"(tmp16) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); } void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; - int32_t above2, left2; + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, left2; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[above1], (%[above]) \n\t" "lw %[above2], 4(%[above]) \n\t" "lw %[left1], (%[left]) \n\t" @@ -316,14 
+313,12 @@ void vpx_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, "sw %[expected_dc], 8(%[dst]) \n\t" "sw %[expected_dc], 12(%[dst]) \n\t" - : [left1] "=&r" (left1), [above1] "=&r" (above1), - [left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1), - [left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1), - [above2] "=&r" (above2), [left2] "=&r" (left2), - [average] "=&r" (average), [tmp] "=&r" (tmp), - [expected_dc] "=&r" (expected_dc) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride) - ); + : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1), + [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1), + [above_r1] "=&r"(above_r1), [above2] "=&r"(above2), + [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); } #endif // #if HAVE_DSPR2 diff --git a/vpx_dsp/mips/intrapred4_dspr2.c b/vpx_dsp/mips/intrapred4_dspr2.c index 03baf4c9cc81019ba6960e299414831da42fc131..9f51d50c752f820b4a90fd65636375799977d6de 100644 --- a/vpx_dsp/mips/intrapred4_dspr2.c +++ b/vpx_dsp/mips/intrapred4_dspr2.c @@ -13,9 +13,9 @@ #if HAVE_DSPR2 void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4; + int32_t tmp1, tmp2, tmp3, tmp4; - __asm__ __volatile__ ( + __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" "lb %[tmp2], 1(%[left]) \n\t" "lb %[tmp3], 2(%[left]) \n\t" @@ -32,19 +32,18 @@ void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, "add %[dst], %[dst], %[stride] \n\t" "sw %[tmp4], (%[dst]) \n\t" - : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), - [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4) - : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride) - ); + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); } void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; + int32_t expected_dc; + int32_t average; + int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[above_c], (%[above]) \n\t" "lw %[left_c], (%[left]) \n\t" @@ -70,27 +69,26 @@ void vpx_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, "add %[dst], %[dst], %[stride] \n\t" "sw %[expected_dc], (%[dst]) \n\t" - : [above_c] "=&r" (above_c), [above_l] "=&r" (above_l), - [above_r] "=&r" (above_r), [left_c] "=&r" (left_c), - [left_l] "=&r" (left_l), [left_r] "=&r" (left_r), - [average] "=&r" (average), [tmp] "=&r" (tmp), - [expected_dc] "=&r" (expected_dc) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride) - ); + : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l), + [above_r] "=&r"(above_r), [left_c] "=&r"(left_c), + [left_l] "=&r"(left_l), [left_r] "=&r"(left_r), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); } void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t abovel, abover; - int32_t left0, left1, left2, left3; - int32_t res0, res1; - int32_t resl; - int32_t resr; - int32_t top_left; - uint8_t *cm = vpx_ff_cropTbl; - - __asm__ __volatile__ ( + 
int32_t abovel, abover; + int32_t left0, left1, left2, left3; + int32_t res0, res1; + int32_t resl; + int32_t resr; + int32_t top_left; + uint8_t *cm = vpx_ff_cropTbl; + + __asm__ __volatile__( "ulw %[resl], (%[above]) \n\t" "lbu %[left0], (%[left]) \n\t" @@ -174,7 +172,6 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, "sra %[res0], %[res0], 16 \n\t" "lbux %[res0], %[res0](%[cm]) \n\t" - "sra %[res1], %[resr], 16 \n\t" "lbux %[res1], %[res1](%[cm]) \n\t" "sb %[res0], (%[dst]) \n\t" @@ -183,7 +180,6 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, "sra %[res0], %[res0], 16 \n\t" "lbux %[res0], %[res0](%[cm]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" "sra %[res1], %[resl], 16 \n\t" "lbux %[res1], %[res1](%[cm]) \n\t" @@ -218,12 +214,11 @@ void vpx_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, "sb %[res0], 2(%[dst]) \n\t" "sb %[res1], 3(%[dst]) \n\t" - : [abovel] "=&r" (abovel), [abover] "=&r" (abover), - [left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2), - [res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3), - [resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) - ); + : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0), + [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0), + [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl), + [resr] "=&r"(resr), [top_left] "=&r"(top_left) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride), [cm] "r"(cm)); } #endif // #if HAVE_DSPR2 diff --git a/vpx_dsp/mips/intrapred8_dspr2.c b/vpx_dsp/mips/intrapred8_dspr2.c index 196ff5a062ef53ee03f1a769f534bb91ff042810..eac79d51000b0afb5fc8c2b023282eeb45ed17f2 100644 --- a/vpx_dsp/mips/intrapred8_dspr2.c +++ b/vpx_dsp/mips/intrapred8_dspr2.c @@ -13,9 +13,9 @@ #if HAVE_DSPR2 void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - __asm__ __volatile__ ( + __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" "lb %[tmp2], 1(%[left]) \n\t" "lb %[tmp3], 2(%[left]) \n\t" @@ -58,23 +58,20 @@ void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, "sw %[tmp8], (%[dst]) \n\t" "sw %[tmp8], 4(%[dst]) \n\t" - : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), - [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), - [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), - [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8) - : [left] "r" (left), [dst] "r" (dst), - [stride] "r" (stride) - ); + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); } void vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; - int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[above1], (%[above]) \n\t" "lw %[above2], 4(%[above]) \n\t" "lw %[left1], (%[left]) \n\t" @@ -137,30 +134,29 @@ void 
vpx_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, "sw %[expected_dc], (%[dst]) \n\t" "sw %[expected_dc], 4(%[dst]) \n\t" - : [above1] "=&r" (above1), [above_l1] "=&r" (above_l1), - [above_r1] "=&r" (above_r1), [left1] "=&r" (left1), - [left_l1] "=&r" (left_l1), [left_r1] "=&r" (left_r1), - [above2] "=&r" (above2), [above_l2] "=&r" (above_l2), - [above_r2] "=&r" (above_r2), [left2] "=&r" (left2), - [left_l2] "=&r" (left_l2), [left_r2] "=&r" (left_r2), - [average] "=&r" (average), [tmp] "=&r" (tmp), - [expected_dc] "=&r" (expected_dc) - : [above] "r" (above), [left] "r" (left), [dst] "r" (dst), - [stride] "r" (stride) - ); + : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1), + [above_r1] "=&r"(above_r1), [left1] "=&r"(left1), + [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1), + [above2] "=&r"(above2), [above_l2] "=&r"(above_l2), + [above_r2] "=&r"(above_r2), [left2] "=&r"(left2), + [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); } void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int32_t abovel, abover; - int32_t abovel_1, abover_1; - int32_t left0; - int32_t res0, res1, res2, res3; - int32_t reshw; - int32_t top_left; - uint8_t *cm = vpx_ff_cropTbl; - - __asm__ __volatile__ ( + int32_t abovel, abover; + int32_t abovel_1, abover_1; + int32_t left0; + int32_t res0, res1, res2, res3; + int32_t reshw; + int32_t top_left; + uint8_t *cm = vpx_ff_cropTbl; + + __asm__ __volatile__( "ulw %[reshw], (%[above]) \n\t" "ulw %[top_left], 4(%[above]) \n\t" @@ -595,13 +591,12 @@ void vpx_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, "sb %[res2], 6(%[dst]) \n\t" "sb %[res3], 7(%[dst]) \n\t" - : [abovel] "=&r" (abovel), [abover] "=&r" (abover), - [abovel_1] "=&r" (abovel_1), [abover_1] "=&r" (abover_1), - [left0] "=&r" (left0), [res2] "=&r" (res2), [res3] "=&r" (res3), - [res0] "=&r" (res0), [res1] "=&r" (res1), - [reshw] "=&r" (reshw), [top_left] "=&r" (top_left) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) - ); + : [abovel] "=&r"(abovel), [abover] "=&r"(abover), + [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1), + [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3), + [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw), + [top_left] "=&r"(top_left) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride), [cm] "r"(cm)); } #endif // #if HAVE_DSPR2 diff --git a/vpx_dsp/mips/intrapred_msa.c b/vpx_dsp/mips/intrapred_msa.c index f6fbe4016257c93a8cac7a071e7205c031828015..b5ee943031a0982bbff270d62afe9b8346a400ef 100644 --- a/vpx_dsp/mips/intrapred_msa.c +++ b/vpx_dsp/mips/intrapred_msa.c @@ -11,10 +11,11 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" -#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \ - out0 = __msa_subs_u_h(out0, in0); \ - out1 = __msa_subs_u_h(out1, in1); \ -} +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ + { \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ + } static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, int32_t dst_stride) { @@ -150,8 +151,8 @@ static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, } static void intra_predict_dc_4x4_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t 
*src_left, uint8_t *dst, + int32_t dst_stride) { uint32_t val0, val1; v16i8 store, src = { 0 }; v8u16 sum_h; @@ -199,8 +200,8 @@ static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { } static void intra_predict_dc_8x8_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint64_t val0, val1; v16i8 store; v16u8 src = { 0 }; @@ -260,8 +261,8 @@ static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { } static void intra_predict_dc_16x16_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { v16u8 top, left, out; v8u16 sum_h, sum_top, sum_left; v4u32 sum_w; @@ -313,8 +314,8 @@ static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { } static void intra_predict_dc_32x32_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint32_t row; v16u8 top0, top1, left0, left1, out; v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; @@ -381,8 +382,8 @@ static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { } static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint32_t val; uint8_t top_left = src_top_ptr[-1]; v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; @@ -409,8 +410,8 @@ static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, } static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint64_t val; uint8_t top_left = src_top_ptr[-1]; uint32_t loop_cnt; @@ -442,8 +443,8 @@ static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, } static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint8_t top_left = src_top_ptr[-1]; uint32_t loop_cnt; v16i8 src_top, src_left0, src_left1, src_left2, src_left3; @@ -491,8 +492,8 @@ static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, } static void intra_predict_tm_32x32_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { uint8_t top_left = src_top[-1]; uint32_t loop_cnt; v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; diff --git a/vpx_dsp/mips/inv_txfm_dspr2.h b/vpx_dsp/mips/inv_txfm_dspr2.h index abd85091188eb371cc2375b26665c3cd9e95e118..edd54aec5e20378d0cda7fe7b371ac8d396f67dd 100644 --- a/vpx_dsp/mips/inv_txfm_dspr2.h +++ b/vpx_dsp/mips/inv_txfm_dspr2.h @@ -23,31 +23,39 @@ extern "C" { #endif #if HAVE_DSPR2 -#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) ({ \ +#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ + ({ \ \ - int32_t tmp, out; \ - int dct_cost_rounding = DCT_CONST_ROUNDING; \ - int in = input; \ + int32_t tmp, out; \ + int dct_cost_rounding = DCT_CONST_ROUNDING; \ + int in = input; \ \ - __asm__ __volatile__ ( \ - /* out = dct_const_round_shift(input_dc * cospi_16_64); */ \ - "mtlo %[dct_cost_rounding], $ac1 \n\t"\ - "mthi $zero, $ac1 \n\t"\ - 
"madd $ac1, %[in], %[cospi_16_64] \n\t"\ - "extp %[tmp], $ac1, 31 \n\t"\ + __asm__ __volatile__(/* out = dct_const_round_shift(dc * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac1 " \ + " \n\t" \ + "mthi $zero, $ac1 " \ + " \n\t" \ + "madd $ac1, %[in], " \ + "%[cospi_16_64] \n\t" \ + "extp %[tmp], $ac1, " \ + "31 \n\t" \ \ - /* out = dct_const_round_shift(out * cospi_16_64); */ \ - "mtlo %[dct_cost_rounding], $ac2 \n\t"\ - "mthi $zero, $ac2 \n\t"\ - "madd $ac2, %[tmp], %[cospi_16_64] \n\t"\ - "extp %[out], $ac2, 31 \n\t"\ + /* out = dct_const_round_shift(out * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac2 " \ + " \n\t" \ + "mthi $zero, $ac2 " \ + " \n\t" \ + "madd $ac2, %[tmp], " \ + "%[cospi_16_64] \n\t" \ + "extp %[out], $ac2, " \ + "31 \n\t" \ \ - : [tmp] "=&r" (tmp), [out] "=r" (out) \ - : [in] "r" (in), \ - [dct_cost_rounding] "r" (dct_cost_rounding), \ - [cospi_16_64] "r" (cospi_16_64) \ - ); \ - out; }) + : [tmp] "=&r"(tmp), [out] "=r"(out) \ + : [in] "r"(in), \ + [dct_cost_rounding] "r"(dct_cost_rounding), \ + [cospi_16_64] "r"(cospi_16_64)); \ + out; \ + }) void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); @@ -59,10 +67,8 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); void iadst8_dspr2(const int16_t *input, int16_t *output); -void idct16_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows); -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); void iadst16_dspr2(const int16_t *input, int16_t *output); #endif // #if HAVE_DSPR2 diff --git a/vpx_dsp/mips/inv_txfm_msa.h b/vpx_dsp/mips/inv_txfm_msa.h index 303fb3ea67368745052dfaa8d3637191f30b3124..ee94782c9a45f90353027e0f570922199338b87c 100644 --- a/vpx_dsp/mips/inv_txfm_msa.h +++ b/vpx_dsp/mips/inv_txfm_msa.h @@ -15,391 +15,392 @@ #include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/txfm_common.h" -#define VPX_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ - v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ - cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ - v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, \ - -cospi_16_64, cospi_24_64, -cospi_24_64, 0, 0 }; \ - \ - SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ - ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ - cnst1_m, cnst2_m, cnst3_m, in7, in0, \ - in4, in3); \ - \ - SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - \ - 
DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ - cnst1_m, cnst2_m, cnst3_m, in5, in2, \ - in6, in1); \ - BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ - out7 = -s0_m; \ - out0 = s1_m; \ - \ - SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, \ - cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ - \ - ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - cnst1_m = cnst0_m; \ - \ - ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, \ - cnst2_m, cnst3_m, cnst1_m, out1, out6, \ - s0_m, s1_m); \ - \ - SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ - \ - out1 = -out1; \ - out3 = -out3; \ - out5 = -out5; \ -} +#define VPX_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ + cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in7, in0, in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in5, in2, in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ + cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 
= DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ + } -#define VPX_SET_COSPI_PAIR(c0_h, c1_h) ({ \ - v8i16 out0_m, r0_m, r1_m; \ - \ - r0_m = __msa_fill_h(c0_h); \ - r1_m = __msa_fill_h(c1_h); \ - out0_m = __msa_ilvev_h(r1_m, r0_m); \ - \ - out0_m; \ -}) +#define VPX_SET_COSPI_PAIR(c0_h, c1_h) \ + ({ \ + v8i16 out0_m, r0_m, r1_m; \ + \ + r0_m = __msa_fill_h(c0_h); \ + r1_m = __msa_fill_h(c1_h); \ + out0_m = __msa_ilvev_h(r1_m, r0_m); \ + \ + out0_m; \ + }) -#define VPX_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) { \ - uint8_t *dst_m = (uint8_t *) (dst); \ - v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ - v16i8 tmp0_m, tmp1_m; \ - v16i8 zero_m = { 0 }; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - \ - LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ - ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, \ - zero_m, dst3_m, res0_m, res1_m, res2_m, res3_m); \ - ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, \ - res0_m, res1_m, res2_m, res3_m); \ - CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ - PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ -} +#define VPX_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \ + { \ + uint8_t *dst_m = (uint8_t *)(dst); \ + v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 zero_m = { 0 }; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ + ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \ + res0_m, res1_m, res2_m, res3_m); \ + ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \ + res2_m, res3_m); \ + CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ + PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ + } -#define VPX_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 c0_m, c1_m, c2_m, c3_m; \ - v8i16 step0_m, step1_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - c0_m = VPX_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ - c1_m = VPX_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ - step0_m = __msa_ilvr_h(in2, in0); \ - DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - \ - c2_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - c3_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - step1_m = __msa_ilvr_h(in3, in1); \ - DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - \ - PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ - SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ - BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, \ - (v8i16)tmp2_m, (v8i16)tmp3_m, \ - out0, out1, out2, out3); \ -} +#define VPX_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 c0_m, c1_m, c2_m, c3_m; \ + v8i16 step0_m, step1_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + c0_m = VPX_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + c1_m = VPX_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + step0_m = __msa_ilvr_h(in2, in0); \ + DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + \ + c2_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + c3_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + step1_m = __msa_ilvr_h(in3, in1); \ + DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + \ + PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, 
tmp2_m, tmp0_m, tmp2_m); \ + SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ + BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \ + out0, out1, out2, out3); \ + } -#define VPX_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 res0_m, res1_m, c0_m, c1_m; \ - v8i16 k1_m, k2_m, k3_m, k4_m; \ - v8i16 zero_m = { 0 }; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v4i32 int0_m, int1_m, int2_m, int3_m; \ - v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, \ - sinpi_4_9, -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, \ - -sinpi_4_9 }; \ - \ - SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ - ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ - ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ - int0_m = tmp2_m + tmp1_m; \ - \ - SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ - ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - int1_m = tmp0_m + tmp1_m; \ - \ - c0_m = __msa_splati_h(mask_m, 6); \ - ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ - ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - int2_m = tmp0_m + tmp1_m; \ - \ - c0_m = __msa_splati_h(mask_m, 6); \ - c0_m = __msa_ilvev_h(c0_m, k1_m); \ - \ - res0_m = __msa_ilvr_h((in1), (in3)); \ - tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ - int3_m = tmp2_m + tmp0_m; \ - \ - res0_m = __msa_ilvr_h((in2), (in3)); \ - c1_m = __msa_ilvev_h(k4_m, k3_m); \ - \ - tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ - res1_m = __msa_ilvr_h((in0), (in2)); \ - c1_m = __msa_ilvev_h(k1_m, zero_m); \ - \ - tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ - int3_m += tmp2_m; \ - int3_m += tmp3_m; \ - \ - SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ - PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ -} +#define VPX_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 res0_m, res1_m, c0_m, c1_m; \ + v8i16 k1_m, k2_m, k3_m, k4_m; \ + v8i16 zero_m = { 0 }; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 int0_m, int1_m, int2_m, int3_m; \ + v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \ + -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \ + \ + SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ + ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ + int0_m = tmp2_m + tmp1_m; \ + \ + SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ + ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int1_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int2_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + c0_m = __msa_ilvev_h(c0_m, k1_m); \ + \ + res0_m = __msa_ilvr_h((in1), (in3)); \ + tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ + int3_m = tmp2_m + tmp0_m; \ + \ + res0_m = __msa_ilvr_h((in2), (in3)); \ + c1_m = __msa_ilvev_h(k4_m, k3_m); \ + \ + tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ + res1_m = __msa_ilvr_h((in0), (in2)); \ + c1_m = __msa_ilvev_h(k1_m, zero_m); \ + \ + tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ + int3_m += tmp2_m; \ + int3_m += tmp3_m; \ + \ + SRARI_W4_SW(int0_m, int1_m, 
int2_m, int3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ + PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ + } -#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) ({ \ - v8i16 c0_m, c1_m; \ - \ - SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ - c0_m = __msa_ilvev_h(c1_m, c0_m); \ - \ - c0_m; \ -}) +#define VP9_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \ + ({ \ + v8i16 c0_m, c1_m; \ + \ + SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ + c0_m = __msa_ilvev_h(c1_m, c0_m); \ + \ + c0_m; \ + }) /* multiply and add macro */ -#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ - out0, out1, out2, out3) { \ - v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ - ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ - DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, \ - cst0, cst0, cst1, cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ - DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, \ - cst2, cst2, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ -} +#define VP9_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ + DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ + cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ + cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ + } /* idct 8x8 macro */ -#define VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ - v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ - cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ - \ - k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \ - k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \ - k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \ - k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \ - VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ - SUB2(in1, in3, in7, in5, res0_m, res1_m); \ - k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \ - k1_m = __msa_splati_h(mask_m, 4); \ - \ - ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ - DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ - tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - tp4_m = in1 + in3; \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ - tp7_m = in7 + in5; \ - k2_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - k3_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, \ - in0, in4, in2, in6); \ - 
BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ - BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, \ - out0, out1, out2, out3, out4, out5, out6, out7); \ -} +#define VPX_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ + cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ + \ + k0_m = VP9_SET_CONST_PAIR(mask_m, 0, 5); \ + k1_m = VP9_SET_CONST_PAIR(mask_m, 1, 0); \ + k2_m = VP9_SET_CONST_PAIR(mask_m, 6, 3); \ + k3_m = VP9_SET_CONST_PAIR(mask_m, 3, 2); \ + VP9_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ + SUB2(in1, in3, in7, in5, res0_m, res1_m); \ + k0_m = VP9_SET_CONST_PAIR(mask_m, 4, 7); \ + k1_m = __msa_splati_h(mask_m, 4); \ + \ + ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ + DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + tp4_m = in1 + in3; \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ + tp7_m = in7 + in5; \ + k2_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k3_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + VP9_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \ + BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ + BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \ + out1, out2, out3, out4, out5, out6, out7); \ + } -#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ - v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ - v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, \ - cospi_10_64, cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ - v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, \ - cospi_6_64, -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ - v8i16 mask3_m = { -cospi_24_64, cospi_8_64, cospi_16_64, \ - -cospi_16_64, 0, 0, 0, 0 }; \ - \ - k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \ - k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \ - ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - r0_m, r1_m, r2_m, r3_m); \ - k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \ - k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \ - ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - r4_m, r5_m, r6_m, r7_m); \ - ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ - SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ - k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \ - k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \ - ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - r0_m, r1_m, r2_m, r3_m); \ - k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \ - k1_m = 
VP9_SET_CONST_PAIR(mask2_m, 3, 4); \ - ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - r4_m, r5_m, r6_m, r7_m); \ - ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ - SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ - ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ - BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ - k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \ - k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \ - ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - r0_m, r1_m, r2_m, r3_m); \ - k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \ - DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ - r4_m, r5_m, r6_m, r7_m); \ - ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ - SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ - k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \ - k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \ - ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ - ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ - DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, \ - m0_m, m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ - \ - out1 = -in1; \ - out3 = -in3; \ - out5 = -in5; \ - out7 = -in7; \ -} +#define VP9_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ + v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ + v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \ + cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ + v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \ + -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ + v8i16 mask3_m = { \ + -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \ + }; \ + \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 0, 1); \ + k1_m = VP9_SET_CONST_PAIR(mask1_m, 1, 2); \ + ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 6, 7); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 0, 1); \ + ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, 
DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ + k0_m = VP9_SET_CONST_PAIR(mask1_m, 3, 4); \ + k1_m = VP9_SET_CONST_PAIR(mask1_m, 4, 5); \ + ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = VP9_SET_CONST_PAIR(mask2_m, 2, 3); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 3, 4); \ + ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ + ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ + BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ + k0_m = VP9_SET_CONST_PAIR(mask2_m, 5, 6); \ + k1_m = VP9_SET_CONST_PAIR(mask2_m, 6, 7); \ + ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k1_m = VP9_SET_CONST_PAIR(mask3_m, 0, 1); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \ + r6_m, r7_m); \ + ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ + SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ + k0_m = VP9_SET_CONST_PAIR(mask3_m, 2, 2); \ + k1_m = VP9_SET_CONST_PAIR(mask3_m, 2, 3); \ + ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \ + m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ + ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \ + m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ + \ + out1 = -in1; \ + out3 = -in3; \ + out5 = -in5; \ + out7 = -in7; \ + } -#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, \ - r9, r10, r11, r12, r13, r14, r15, \ - out0, out1, out2, out3, out4, out5, \ - out6, out7, out8, out9, out10, out11, \ - out12, out13, out14, out15) { \ - v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ - v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ - v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ - v8i16 h8_m, h9_m, h10_m, h11_m; \ - v8i16 k0_m, k1_m, k2_m, k3_m; \ - \ - /* stage 1 */ \ - k0_m = VPX_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ - k3_m = VPX_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ - MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \ - g0_m, g1_m, g2_m, g3_m); \ - k0_m = VPX_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ - k3_m = VPX_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ - MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \ - g4_m, g5_m, 
g6_m, g7_m); \ - k0_m = VPX_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ - k3_m = VPX_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ - MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \ - g8_m, g9_m, g10_m, g11_m); \ - k0_m = VPX_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ - k3_m = VPX_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ - MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \ - g12_m, g13_m, g14_m, g15_m); \ - \ - /* stage 2 */ \ - k0_m = VPX_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ - k2_m = VPX_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ - MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \ - h0_m, h1_m, h2_m, h3_m); \ - k0_m = VPX_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ - k1_m = VPX_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ - MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \ - h4_m, h5_m, h6_m, h7_m); \ - BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ - BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \ - h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ - \ - /* stage 3 */ \ - BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ - k0_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - k1_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - k2_m = VPX_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ - MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \ - out4, out6, out5, out7); \ - MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \ - out12, out14, out13, out15); \ - \ - /* stage 4 */ \ - k0_m = VPX_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ - k1_m = VPX_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ - k2_m = VPX_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ - k3_m = VPX_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ - MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ - MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ - MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ - MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ -} +#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \ + r12, r13, r14, r15, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ + v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ + v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ + v8i16 h8_m, h9_m, h10_m, h11_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m; \ + \ + /* stage 1 */ \ + k0_m = VPX_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ + k3_m = VPX_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ + MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \ + k0_m = VPX_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ + k3_m = VPX_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ + MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \ + k0_m = VPX_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_25_64, 
cospi_7_64); \ + k3_m = VPX_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ + MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \ + g11_m); \ + k0_m = VPX_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ + k3_m = VPX_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ + MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \ + g15_m); \ + \ + /* stage 2 */ \ + k0_m = VPX_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ + k2_m = VPX_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ + MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \ + h3_m); \ + k0_m = VPX_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ + k1_m = VPX_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ + MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \ + h6_m, h7_m); \ + BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ + BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \ + h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ + \ + /* stage 3 */ \ + BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ + k0_m = VPX_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + k1_m = VPX_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k2_m = VPX_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ + MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \ + out7); \ + MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \ + out13, out15); \ + \ + /* stage 4 */ \ + k0_m = VPX_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + k1_m = VPX_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ + k2_m = VPX_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + k3_m = VPX_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ + MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ + MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ + MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ + MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ + } void vpx_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, int32_t dst_stride); diff --git a/vpx_dsp/mips/itrans16_dspr2.c b/vpx_dsp/mips/itrans16_dspr2.c index 6d41e6190b78c032023aa851efc9c65c71c7c8f1..0ec0c2059f4463c6c84185892f9e4ec2d9375a8f 100644 --- a/vpx_dsp/mips/itrans16_dspr2.c +++ b/vpx_dsp/mips/itrans16_dspr2.c @@ -26,11 +26,11 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, int result1, result2, result3, result4; const int const_2_power_13 = 8192; - for (i = no_rows; i--; ) { + for (i = no_rows; i--;) { /* prefetch row */ prefetch_load((const uint8_t *)(input + 16)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" "lh %[load2], 16(%[input]) \n\t" "lh %[load3], 8(%[input]) \n\t" @@ -64,19 +64,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "sub %[step1_2], %[step2_1], %[step2_2] \n\t" "sub %[step1_3], %[step2_0], %[step2_3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), - [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" 
(cospi_8_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "lh %[load5], 2(%[input]) \n\t" "lh %[load6], 30(%[input]) \n\t" "lh %[load7], 18(%[input]) \n\t" @@ -126,19 +125,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "add %[step2_8], %[result1], %[result2] \n\t" "add %[step2_15], %[result4], %[result3] \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), - [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( "lh %[load1], 10(%[input]) \n\t" "lh %[load2], 22(%[input]) \n\t" "lh %[load3], 26(%[input]) \n\t" @@ -188,19 +186,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "add %[step2_11], %[result1], %[result2] \n\t" "add %[step2_12], %[result4], %[result3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] 
"r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( "lh %[load5], 4(%[input]) \n\t" "lh %[load6], 28(%[input]) \n\t" "lh %[load7], 20(%[input]) \n\t" @@ -253,19 +250,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_4], %[result1], %[result2] \n\t" "add %[step1_7], %[result4], %[result3] \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t" @@ -305,18 +301,16 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "extp %[step1_11], $ac2, 31 \n\t" "extp %[step1_12], $ac3, 31 \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), - [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) - : [const_2_power_13] "r" (const_2_power_13), - [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), - [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), - [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), - [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "add %[load5], %[step1_0], %[step1_7] \n\t" "add %[load5], %[load5], %[step2_12] \n\t" "add %[load5], %[load5], %[step2_15] \n\t" @@ -350,17 +344,15 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "sh %[load5], 448(%[output]) \n\t" "sh %[load6], 480(%[output]) \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6) - : [output] "r" (output), - [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), - [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), - [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), - [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), - [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), - [step2_14] "r" (step2_14), [step2_15] "r" (step2_15) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] 
"r"(step1_1), + [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), + [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), + [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), + [step2_14] "r"(step2_14), [step2_15] "r"(step2_15)); + + __asm__ __volatile__( "add %[load5], %[step1_2], %[step1_5] \n\t" "add %[load5], %[load5], %[step1_13] \n\t" "add %[load6], %[step1_3], %[step1_4] \n\t" @@ -386,21 +378,18 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, "sh %[load5], 384(%[output]) \n\t" "sh %[load6], 416(%[output]) \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6) - : [output] "r" (output), - [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), - [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), - [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), - [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) - ); + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), + [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13)); input += 16; output += 1; } } -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_8, step1_9, step1_10, step1_11; @@ -416,9 +405,9 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, /* prefetch vpx_ff_cropTbl */ prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); + prefetch_load(vpx_ff_cropTbl + 32); + prefetch_load(vpx_ff_cropTbl + 64); + prefetch_load(vpx_ff_cropTbl + 96); prefetch_load(vpx_ff_cropTbl + 128); prefetch_load(vpx_ff_cropTbl + 160); prefetch_load(vpx_ff_cropTbl + 192); @@ -426,7 +415,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, for (i = 0; i < 16; ++i) { dest_pix = (dest + i); - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" "lh %[load2], 16(%[input]) \n\t" "lh %[load3], 8(%[input]) \n\t" @@ -460,19 +449,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sub %[step1_2], %[step2_1], %[step2_2] \n\t" "sub %[step1_3], %[step2_0], %[step2_3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), - [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "lh 
%[load5], 2(%[input]) \n\t" "lh %[load6], 30(%[input]) \n\t" "lh %[load7], 18(%[input]) \n\t" @@ -522,19 +510,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step2_8], %[result1], %[result2] \n\t" "add %[step2_15], %[result4], %[result3] \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), - [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( "lh %[load1], 10(%[input]) \n\t" "lh %[load2], 22(%[input]) \n\t" "lh %[load3], 26(%[input]) \n\t" @@ -584,19 +571,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step2_11], %[result1], %[result2] \n\t" "add %[step2_12], %[result4], %[result3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( "lh %[load5], 4(%[input]) \n\t" "lh %[load6], 28(%[input]) \n\t" "lh %[load7], 20(%[input]) \n\t" @@ -650,19 +636,18 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_4], %[result1], %[result2] \n\t" "add %[step1_7], %[result4], %[result3] \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [load7] "=&r" (load7), [load8] "=&r" (load8), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [result3] "=&r" (result3), [result4] "=&r" (result4), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), 
[step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t" @@ -702,23 +687,21 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "extp %[step1_11], $ac2, 31 \n\t" "extp %[step1_12], $ac3, 31 \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), - [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), - [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) - : [const_2_power_13] "r" (const_2_power_13), - [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), - [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), - [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), - [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), - [cospi_16_64] "r" (cospi_16_64) - ); + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); step1_8 = step2_8 + step2_11; step1_9 = step2_9 + step2_10; step1_14 = step2_13 + step2_14; step1_15 = step2_12 + step2_15; - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_0], %[step1_7] \n\t" "add %[load5], %[load5], %[step1_15] \n\t" @@ -870,18 +853,16 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7), - [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), - [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), - [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), - [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), - [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), - [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), - [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), - [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) - ); + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) + : + [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), + [step1_10] "r"(step1_10), [step1_11] 
"r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), + [step1_14] "r"(step1_14), [step1_15] "r"(step1_15)); input += 16; } @@ -889,15 +870,11 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); // First transform rows idct16_rows_dspr2(input, out, 16); @@ -908,17 +885,13 @@ void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); int16_t *outptr = out; uint32_t i; uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. @@ -926,7 +899,7 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, outptr += 4; for (i = 0; i < 6; ++i) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[outptr]) \n\t" "sw $zero, 32(%[outptr]) \n\t" "sw $zero, 64(%[outptr]) \n\t" @@ -945,8 +918,7 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, "sw $zero, 480(%[outptr]) \n\t" : - : [outptr] "r" (outptr) - ); + : [outptr] "r"(outptr)); outptr += 2; } @@ -966,35 +938,31 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, int32_t vector_1, vector_2, vector_3, vector_4; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + : + : [pos] "r"(pos)); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__ ( + __asm__ __volatile__( "addi %[out], %[out], 32 \n\t" "sra %[a1], %[out], 6 \n\t" - : [out] "+r" (out), [a1] "=r" (a1) - : - ); + : [out] "+r"(out), [a1] "=r"(a1) + :); if (a1 < 0) { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( + __asm__ __volatile__( "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 16; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t" @@ -1009,25 +977,22 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_4], 12(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), 
[vector_a1] "r"(vector_a1)); } } else { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 16; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t" @@ -1042,12 +1007,11 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_4], 12(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); } } } @@ -1072,21 +1036,20 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { int x14 = input[1]; int x15 = input[14]; - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 - | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = output[8] - = output[9] = output[10] = output[11] = output[12] - = output[13] = output[14] = output[15] = 0; + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = output[8] = output[9] = output[10] = + output[11] = output[12] = output[13] = output[14] = output[15] = 0; return; } // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; s5 = x4 * cospi_23_64 - x5 * cospi_9_64; s6 = x6 * cospi_13_64 + x7 * cospi_19_64; s7 = x6 * cospi_19_64 - x7 * cospi_13_64; @@ -1095,9 +1058,9 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { s10 = x10 * cospi_21_64 + x11 * cospi_11_64; s11 = x10 * cospi_11_64 - x11 * cospi_21_64; s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; x0 = dct_const_round_shift(s0 + s8); x1 = dct_const_round_shift(s1 + s9); @@ -1107,8 +1070,8 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { x5 = dct_const_round_shift(s5 + s13); x6 = dct_const_round_shift(s6 + s14); x7 = dct_const_round_shift(s7 + s15); - x8 = dct_const_round_shift(s0 - s8); - x9 = dct_const_round_shift(s1 - s9); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); x10 = dct_const_round_shift(s2 - s10); x11 = dct_const_round_shift(s3 - s11); x12 = dct_const_round_shift(s4 - s12); @@ -1125,14 +1088,14 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { s5 = x5; s6 = x6; s7 = x7; - s8 = x8 * 
cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; x0 = s0 + s4; x1 = s1 + s5; @@ -1156,18 +1119,18 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { s1 = x1; s2 = x2; s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; s8 = x8; s9 = x9; s10 = x10; s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; x0 = s0 + s2; x1 = s1 + s3; @@ -1187,13 +1150,13 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { x15 = dct_const_round_shift(s13 - s15); // stage 4 - s2 = (- cospi_16_64) * (x2 + x3); + s2 = (-cospi_16_64) * (x2 + x3); s3 = cospi_16_64 * (x2 - x3); s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (- x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (- x10 + x11); - s14 = (- cospi_16_64) * (x14 + x15); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); x2 = dct_const_round_shift(s2); @@ -1205,23 +1168,22 @@ void iadst16_dspr2(const int16_t *input, int16_t *output) { x14 = dct_const_round_shift(s14); x15 = dct_const_round_shift(s15); - output[0] = x0; + output[0] = x0; output[1] = -x8; - output[2] = x12; + output[2] = x12; output[3] = -x4; - output[4] = x6; - output[5] = x14; - output[6] = x10; - output[7] = x2; - output[8] = x3; - output[9] = x11; - output[10] = x15; - output[11] = x7; - output[12] = x5; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; output[13] = -x13; - output[14] = x9; + output[14] = x9; output[15] = -x1; } - #endif // HAVE_DSPR2 diff --git a/vpx_dsp/mips/itrans32_cols_dspr2.c b/vpx_dsp/mips/itrans32_cols_dspr2.c index 553acb0f5bfd96885447377357fd9966d7f83df0..ce25d55c9c0bb23b4a3bfe28ca14cd84101a0613 100644 --- a/vpx_dsp/mips/itrans32_cols_dspr2.c +++ b/vpx_dsp/mips/itrans32_cols_dspr2.c @@ -39,9 +39,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, /* prefetch vpx_ff_cropTbl */ prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); + prefetch_load(vpx_ff_cropTbl + 32); + prefetch_load(vpx_ff_cropTbl + 64); + prefetch_load(vpx_ff_cropTbl + 96); prefetch_load(vpx_ff_cropTbl + 128); prefetch_load(vpx_ff_cropTbl + 160); 
prefetch_load(vpx_ff_cropTbl + 192); @@ -51,7 +51,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, dest_pix = dest + i; dest_pix1 = dest + i + 31 * dest_stride; - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 2(%[input]) \n\t" "lh %[load2], 62(%[input]) \n\t" "lh %[load3], 34(%[input]) \n\t" @@ -101,18 +101,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_16], %[temp0], %[temp1] \n\t" "add %[step1_31], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), - [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), - [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), + [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), + [step1_31] "=r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( "lh %[load1], 18(%[input]) \n\t" "lh %[load2], 46(%[input]) \n\t" "lh %[load3], 50(%[input]) \n\t" @@ -162,18 +161,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_19], %[temp0], %[temp1] \n\t" "add %[step1_28], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), - [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), - [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), + [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), + [step1_29] "=r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( "lh %[load1], 10(%[input]) \n\t" "lh %[load2], 54(%[input]) \n\t" "lh %[load3], 42(%[input]) \n\t" @@ -223,18 +221,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_20], %[temp0], %[temp1] \n\t" "add %[step1_27], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] 
"=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), - [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), - [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), + [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), + [step1_27] "=r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( "lh %[load1], 26(%[input]) \n\t" "lh %[load2], 38(%[input]) \n\t" "lh %[load3], 58(%[input]) \n\t" @@ -280,18 +277,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_23], %[temp0], %[temp1] \n\t" "add %[step1_24], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), - [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), - [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), + [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), + [step1_25] "=r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( "lh %[load1], 4(%[input]) \n\t" "lh %[load2], 60(%[input]) \n\t" "lh %[load3], 36(%[input]) \n\t" @@ -337,18 +333,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step2_8], %[temp0], %[temp1] \n\t" "add %[step2_15], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), - [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] 
"=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), + [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), + [step2_15] "=r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( "lh %[load1], 20(%[input]) \n\t" "lh %[load2], 44(%[input]) \n\t" "lh %[load3], 52(%[input]) \n\t" @@ -394,18 +389,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step2_11], %[temp0], %[temp1] \n\t" "add %[step2_12], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), - [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "sub %[temp0], %[step2_14], %[step2_13] \n\t" @@ -440,33 +434,31 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), - [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), - [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), - [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) - : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8), - [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), - [step2_11] "r" (step2_11), [step2_12] "r" (step2_12), - [step2_13] "r" (step2_13), [step2_14] "r" (step2_14), - [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), + [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), + [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), + [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), + [step3_15] "=r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); step2_18 = step1_17 - step1_18; step2_29 = step1_30 - step1_29; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" "madd $ac0, %[step2_29], %[cospi_24_64] 
\n\t" "extp %[step3_18], $ac0, 31 \n\t" - : [step3_18] "=r" (step3_18) - : [const_2_power_13] "r" (const_2_power_13), - [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_18] "=r"(step3_18) + : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), + [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -474,18 +466,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, step2_19 = step1_16 - step1_19; step2_28 = step1_31 - step1_28; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" "extp %[step3_19], $ac0, 31 \n\t" - : [step3_19] "=r" (step3_19) - : [const_2_power_13] "r" (const_2_power_13), - [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_19] "=r"(step3_19) + : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), + [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -498,18 +489,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, step2_20 = step1_23 - step1_20; step2_27 = step1_24 - step1_27; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" "extp %[step3_20], $ac0, 31 \n\t" - : [step3_20] "=r" (step3_20) - : [const_2_power_13] "r" (const_2_power_13), - [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_20] "=r"(step3_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -517,18 +507,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, step2_21 = step1_22 - step1_21; step2_26 = step1_25 - step1_26; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" "extp %[step3_21], $ac1, 31 \n\t" - : [step3_21] "=r" (step3_21) - : [const_2_power_13] "r" (const_2_power_13), - [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_21] "=r"(step3_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -556,7 +545,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, step2_30 = step3_30 + step3_25; step2_31 = step3_31 + step3_24; - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" "lh %[load2], 32(%[input]) \n\t" "lh %[load3], 
16(%[input]) \n\t" @@ -588,19 +577,17 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sub %[step1_2], %[temp1], %[temp2] \n\t" "sub %[step1_3], %[temp0], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_16_64] "r" (cospi_16_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( "lh %[load1], 8(%[input]) \n\t" "lh %[load2], 56(%[input]) \n\t" "lh %[load3], 40(%[input]) \n\t" @@ -649,17 +636,15 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[step1_4], %[temp0], %[temp1] \n\t" "add %[step1_7], %[temp3], %[temp2] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); step2_0 = step1_0 + step1_7; step2_1 = step1_1 + step1_6; @@ -688,67 +673,63 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, step1_14 = step2_1 - step3_14; step1_15 = step2_0 - step3_15; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_27], %[step2_20] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_20], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) - : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20), - [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_20 + step2_27) * cospi_16_64; step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ 
__volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_26], %[step2_21] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_21], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) - : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26), - [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), + [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_21 + step2_26) * cospi_16_64; step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_22], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) - : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25), - [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) + : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), + [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_22 + step2_25) * cospi_16_64; step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_24], %[step2_23] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_23], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) - : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24), - [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) + : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), + [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_23 + step2_24) * cospi_16_64; step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_0], %[step2_31] \n\t" "addi %[temp0], %[temp0], 32 \n\t" @@ -783,21 +764,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix]) \n\t" "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), - [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), - [step2_28] "r" (step2_28), [step2_29] "r" (step2_29), - [step2_30] "r" (step2_30), [step2_31] "r" (step2_31) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), + [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), + [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), + [step2_31] "r"(step2_31)); step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 
0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" @@ -820,14 +800,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix1]) \n\t" "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_4], %[step1_27] \n\t" "addi %[temp0], %[temp0], 32 \n\t" @@ -862,21 +841,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix]) \n\t" "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), - [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), - [step1_24] "r" (step1_24), [step1_25] "r" (step1_25), - [step1_26] "r" (step1_26), [step1_27] "r" (step1_27) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4), + [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), + [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), + [step1_27] "r"(step1_27)); step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" @@ -899,14 +877,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix1]) \n\t" "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_8], %[step1_23] \n\t" "addi %[temp0], %[temp0], 32 \n\t" @@ -941,21 +918,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix]) \n\t" "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" 
(temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), - [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), - [step1_20] "r" (step1_20), [step1_21] "r" (step1_21), - [step1_22] "r" (step1_22), [step1_23] "r" (step1_23) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8), + [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), + [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), + [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), + [step1_23] "r"(step1_23)); step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" @@ -978,14 +954,13 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "sb %[temp1], 0(%[dest_pix1]) \n\t" "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_12], %[step2_19] \n\t" "addi %[temp0], %[temp0], 32 \n\t" @@ -1019,21 +994,20 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), - [step1_14] "r" (step1_14), [step1_15] "r" (step1_15), - [step2_16] "r" (step2_16), [step2_17] "r" (step2_17), - [step2_18] "r" (step2_18), [step2_19] "r" (step2_19) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), + [step1_14] "r"(step1_14), [step1_15] "r"(step1_15), + [step2_16] "r"(step2_16), [step2_17] "r"(step2_17), + [step2_18] "r"(step2_18), [step2_19] "r"(step2_19)); step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); - __asm__ __volatile__ ( + __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" @@ -1055,12 +1029,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - : [temp0] "=&r" (temp0), 
[temp1] "=&r" (temp1), [temp2] "=&r" (temp2), - [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) - : [cm] "r" (cm), [dest_stride] "r" (dest_stride), - [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), - [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); input += 32; } diff --git a/vpx_dsp/mips/itrans32_dspr2.c b/vpx_dsp/mips/itrans32_dspr2.c index 523da1df1bc7bb45e78b94f29041e4a7a990c114..d71c5ffed512feeb347abd147c352b303759b66a 100644 --- a/vpx_dsp/mips/itrans32_dspr2.c +++ b/vpx_dsp/mips/itrans32_dspr2.c @@ -40,16 +40,16 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, const int const_2_power_13 = 8192; const int32_t *input_int; - for (i = no_rows; i--; ) { + for (i = no_rows; i--;) { input_int = (const int32_t *)input; - if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | - input_int[4] | input_int[5] | input_int[6] | input_int[7] | - input_int[8] | input_int[9] | input_int[10] | input_int[11] | + if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | + input_int[4] | input_int[5] | input_int[6] | input_int[7] | + input_int[8] | input_int[9] | input_int[10] | input_int[11] | input_int[12] | input_int[13] | input_int[14] | input_int[15])) { input += 32; - __asm__ __volatile__ ( + __asm__ __volatile__( "sh $zero, 0(%[output]) \n\t" "sh $zero, 64(%[output]) \n\t" "sh $zero, 128(%[output]) \n\t" @@ -84,8 +84,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sh $zero, 1984(%[output]) \n\t" : - : [output] "r" (output) - ); + : [output] "r"(output)); output += 1; @@ -96,7 +95,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, prefetch_load((const uint8_t *)(input + 32)); prefetch_load((const uint8_t *)(input + 48)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 2(%[input]) \n\t" "lh %[load2], 62(%[input]) \n\t" "lh %[load3], 34(%[input]) \n\t" @@ -146,19 +145,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_16], %[temp0], %[temp1] \n\t" "add %[step1_31], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), - [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), - [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), + [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), + [step1_31] "=r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( "lh 
%[load1], 18(%[input]) \n\t" "lh %[load2], 46(%[input]) \n\t" "lh %[load3], 50(%[input]) \n\t" @@ -208,19 +205,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_19], %[temp0], %[temp1] \n\t" "add %[step1_28], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), - [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), - [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), + [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), + [step1_29] "=r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( "lh %[load1], 10(%[input]) \n\t" "lh %[load2], 54(%[input]) \n\t" "lh %[load3], 42(%[input]) \n\t" @@ -270,19 +265,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_20], %[temp0], %[temp1] \n\t" "add %[step1_27], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), - [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), - [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), + [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), + [step1_27] "=r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( "lh %[load1], 26(%[input]) \n\t" "lh %[load2], 38(%[input]) \n\t" "lh %[load3], 58(%[input]) \n\t" @@ -332,19 +325,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_23], %[temp0], %[temp1] \n\t" "add %[step1_24], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), - [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) - : [const_2_power_13] "r" 
(const_2_power_13), [input] "r" (input), - [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), - [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), - [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), + [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), + [step1_25] "=r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( "lh %[load1], 4(%[input]) \n\t" "lh %[load2], 60(%[input]) \n\t" "lh %[load3], 36(%[input]) \n\t" @@ -394,19 +385,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step2_8], %[temp0], %[temp1] \n\t" "add %[step2_15], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), - [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), - [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), + [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), + [step2_15] "=r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( "lh %[load1], 20(%[input]) \n\t" "lh %[load2], 44(%[input]) \n\t" "lh %[load3], 52(%[input]) \n\t" @@ -456,19 +445,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step2_11], %[temp0], %[temp1] \n\t" "add %[step2_12], %[temp2], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), - [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), - [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), - [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) - ); - - __asm__ __volatile__ ( + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] 
"r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "sub %[temp0], %[step2_14], %[step2_13] \n\t" @@ -507,34 +494,31 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), - [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), - [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), - [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) - : [const_2_power_13] "r" (const_2_power_13), - [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), - [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), - [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), - [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), - [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), + [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), + [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), + [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), + [step3_15] "=r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); step2_18 = step1_17 - step1_18; step2_29 = step1_30 - step1_29; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" "extp %[step3_18], $ac0, 31 \n\t" - : [step3_18] "=r" (step3_18) - : [const_2_power_13] "r" (const_2_power_13), - [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_18] "=r"(step3_18) + : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), + [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -542,18 +526,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, step2_19 = step1_16 - step1_19; step2_28 = step1_31 - step1_28; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" "extp %[step3_19], $ac0, 31 \n\t" - : [step3_19] "=r" (step3_19) - : [const_2_power_13] "r" (const_2_power_13), - [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_19] "=r"(step3_19) + : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), + [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -566,18 +549,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, step2_20 = step1_23 - step1_20; step2_27 = step1_24 - step1_27; - __asm__ __volatile__ ( + __asm__ 
__volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" "extp %[step3_20], $ac0, 31 \n\t" - : [step3_20] "=r" (step3_20) - : [const_2_power_13] "r" (const_2_power_13), - [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_20] "=r"(step3_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -585,18 +567,17 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, step2_21 = step1_22 - step1_21; step2_26 = step1_25 - step1_26; - __asm__ __volatile__ ( + __asm__ __volatile__( "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" "extp %[step3_21], $ac1, 31 \n\t" - : [step3_21] "=r" (step3_21) - : [const_2_power_13] "r" (const_2_power_13), - [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) - ); + : [step3_21] "=r"(step3_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -624,7 +605,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, step2_30 = step3_30 + step3_25; step2_31 = step3_31 + step3_24; - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" "lh %[load2], 32(%[input]) \n\t" "lh %[load3], 16(%[input]) \n\t" @@ -658,20 +639,19 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[step1_2], %[temp1], %[temp2] \n\t" "sub %[step1_3], %[temp0], %[temp3] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" (load4), - [result1] "=&r" (result1), [result2] "=&r" (result2), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), - [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_16_64] "r" (cospi_16_64), - [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64) - ); + ); - __asm__ __volatile__ ( + __asm__ __volatile__( "lh %[load1], 8(%[input]) \n\t" "lh %[load2], 56(%[input]) \n\t" "lh %[load3], 40(%[input]) \n\t" @@ -724,17 +704,15 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step1_4], %[temp0], %[temp1] \n\t" "add %[step1_7], %[temp3], %[temp2] \n\t" - : [load1] "=&r" (load1), [load2] "=&r" (load2), - [load3] "=&r" (load3), [load4] "=&r" 
(load4), - [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), - [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), - [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), - [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) - : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), - [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_16_64] "r" (cospi_16_64) - ); + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); step2_0 = step1_0 + step1_7; step2_1 = step1_1 + step1_6; @@ -762,66 +740,58 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, step1_14 = step2_1 - step3_14; step1_15 = step2_0 - step3_15; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_27], %[step2_20] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_20], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) - : [const_2_power_13] "r" (const_2_power_13), - [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), - [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_20 + step2_27) * cospi_16_64; step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_26], %[step2_21] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_21], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) - : [const_2_power_13] "r" (const_2_power_13), - [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), - [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), + [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_21 + step2_26) * cospi_16_64; step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp %[step1_22], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) - : [const_2_power_13] "r" (const_2_power_13), - [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), - [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) + : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), + [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_22 + step2_25) * cospi_16_64; step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - __asm__ __volatile__ ( + __asm__ __volatile__( "sub %[temp0], %[step2_24], %[step2_23] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" "extp 
%[step1_23], $ac0, 31 \n\t" - : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) - : [const_2_power_13] "r" (const_2_power_13), - [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), - [cospi_16_64] "r" (cospi_16_64) - ); + : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) + : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), + [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); temp21 = (step2_23 + step2_24) * cospi_16_64; step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; @@ -867,16 +837,14 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); int16_t *outptr = out; uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); // Rows idct32_rows_dspr2(input, outptr, 32); @@ -887,23 +855,21 @@ void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { - DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); int16_t *outptr = out; uint32_t i; uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); // Rows idct32_rows_dspr2(input, outptr, 8); outptr += 8; - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[outptr]) \n\t" "sw $zero, 4(%[outptr]) \n\t" "sw $zero, 8(%[outptr]) \n\t" @@ -918,13 +884,12 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, "sw $zero, 44(%[outptr]) \n\t" : - : [outptr] "r" (outptr) - ); + : [outptr] "r"(outptr)); for (i = 0; i < 31; ++i) { outptr += 32; - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[outptr]) \n\t" "sw $zero, 4(%[outptr]) \n\t" "sw $zero, 8(%[outptr]) \n\t" @@ -939,8 +904,7 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, "sw $zero, 44(%[outptr]) \n\t" : - : [outptr] "r" (outptr) - ); + : [outptr] "r"(outptr)); } // Columns @@ -949,43 +913,39 @@ void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { - int r, out; - int32_t a1, absa1; - int32_t vector_a1; - int32_t t1, t2, t3, t4; - int32_t vector_1, vector_2, vector_3, vector_4; - uint32_t pos = 45; + int r, out; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + : + : [pos] "r"(pos)); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__ ( + __asm__ __volatile__( "addi %[out], %[out], 32 \n\t" "sra %[a1], %[out], 6 \n\t" - : [out] "+r" (out), [a1] "=r" (a1) - : - ); + : [out] "+r"(out), [a1] "=r"(a1) + :); if (a1 < 0) { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( + __asm__ __volatile__( "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [absa1] "=r"(absa1), 
[vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 32; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t" @@ -1014,25 +974,22 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, "add %[dest], %[dest], %[stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [stride] "r" (stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } else { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 32; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "lw %[t3], 8(%[dest]) \n\t" @@ -1061,12 +1018,11 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, "add %[dest], %[dest], %[stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), - [dest] "+&r" (dest) - : [stride] "r" (stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/vpx_dsp/mips/itrans4_dspr2.c b/vpx_dsp/mips/itrans4_dspr2.c index ecb8bd3de7518d0039d92deeb52e9833953a3ca9..516ea80f4ae96179ebbc2c643130ab0f4a4d3543 100644 --- a/vpx_dsp/mips/itrans4_dspr2.c +++ b/vpx_dsp/mips/itrans4_dspr2.c @@ -15,13 +15,13 @@ #if HAVE_DSPR2 void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { - int16_t step_0, step_1, step_2, step_3; - int Temp0, Temp1, Temp2, Temp3; + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; - int i; + int i; - for (i = 4; i--; ) { - __asm__ __volatile__ ( + for (i = 4; i--;) { + __asm__ __volatile__( /* temp_1 = (input[0] + input[2]) * cospi_16_64; step_0 = dct_const_round_shift(temp_1); @@ -83,16 +83,12 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { "sub %[Temp3], %[step_0], %[step_3] \n\t" "sh %[Temp3], 24(%[output]) \n\t" - : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), - [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), - [output] "+r" (output) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), - [cospi_24_64] "r" (cospi_24_64), - [input] "r" (input) - ); + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output) + : [const_2_power_13] "r"(const_2_power_13), + 
[cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input)); input += 4; output += 1; @@ -101,27 +97,27 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { - int16_t step_0, step_1, step_2, step_3; - int Temp0, Temp1, Temp2, Temp3; + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; - int i; - uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; + int i; + uint8_t *dest_pix; + uint8_t *cm = vpx_ff_cropTbl; /* prefetch vpx_ff_cropTbl */ prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); + prefetch_load(vpx_ff_cropTbl + 32); + prefetch_load(vpx_ff_cropTbl + 64); + prefetch_load(vpx_ff_cropTbl + 96); prefetch_load(vpx_ff_cropTbl + 128); prefetch_load(vpx_ff_cropTbl + 160); prefetch_load(vpx_ff_cropTbl + 192); prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 4; ++i) { - dest_pix = (dest + i); + dest_pix = (dest + i); - __asm__ __volatile__ ( + __asm__ __volatile__( /* temp_1 = (input[0] + input[2]) * cospi_16_64; step_0 = dct_const_round_shift(temp_1); @@ -206,16 +202,14 @@ void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[Temp2], %[Temp1](%[cm]) \n\t" "sb %[Temp2], 0(%[dest_pix]) \n\t" - : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), - [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), - [dest_pix] "+r" (dest_pix) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), - [cospi_24_64] "r" (cospi_24_64), - [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) - ); + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), + [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), + [dest_stride] "r"(dest_stride)); input += 4; } @@ -228,11 +222,9 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); // Rows vpx_idct4_rows_dspr2(input, outptr); @@ -243,73 +235,63 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, int dest_stride) { - int a1, absa1; - int r; - int32_t out; - int t2, vector_a1, vector_a; - uint32_t pos = 45; - int16_t input_dc = input[0]; + int a1, absa1; + int r; + int32_t out; + int t2, vector_a1, vector_a; + uint32_t pos = 45; + int16_t input_dc = input[0]; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + : + : [pos] "r"(pos)); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); - __asm__ __volatile__ ( + __asm__ __volatile__( "addi %[out], %[out], 8 \n\t" "sra %[a1], %[out], 4 \n\t" - : [out] "+r" (out), [a1] "=r" (a1) - : - ); + : [out] "+r"(out), [a1] "=r"(a1) + :); if (a1 < 
0) { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( + __asm__ __volatile__( "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 4; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t2], 0(%[dest]) \n\t" "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); } } else { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 4; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t2], 0(%[dest]) \n\t" "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/vpx_dsp/mips/itrans8_dspr2.c b/vpx_dsp/mips/itrans8_dspr2.c index 823e845d59d5618396990d4b18c5e0ae962df131..08a6c78b6e4d2a6b066f82acce04405954514e08 100644 --- a/vpx_dsp/mips/itrans8_dspr2.c +++ b/vpx_dsp/mips/itrans8_dspr2.c @@ -20,8 +20,8 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { int Temp0, Temp1, Temp2, Temp3, Temp4; int i; - for (i = no_rows; i--; ) { - __asm__ __volatile__ ( + for (i = no_rows; i--;) { + __asm__ __volatile__( /* temp_1 = (input[0] + input[4]) * cospi_16_64; step2_0 = dct_const_round_shift(temp_1); @@ -174,20 +174,18 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { "sub %[Temp1], %[step1_0], %[step1_7] \n\t" "sh %[Temp1], 112(%[output]) \n\t" - : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), - [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), - [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), - [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), - [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [Temp4] "=&r" (Temp4) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_24_64] "r" (cospi_24_64), - [output] "r" (output), [input] "r" (input) - ); + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] 
"r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [output] "r"(output), + [input] "r"(input)); input += 8; output += 1; @@ -205,18 +203,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, /* prefetch vpx_ff_cropTbl */ prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); + prefetch_load(vpx_ff_cropTbl + 32); + prefetch_load(vpx_ff_cropTbl + 64); + prefetch_load(vpx_ff_cropTbl + 96); prefetch_load(vpx_ff_cropTbl + 128); prefetch_load(vpx_ff_cropTbl + 160); prefetch_load(vpx_ff_cropTbl + 192); prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 8; ++i) { - dest_pix = (dest + i); + dest_pix = (dest + i); - __asm__ __volatile__ ( + __asm__ __volatile__( /* temp_1 = (input[0] + input[4]) * cospi_16_64; step2_0 = dct_const_round_shift(temp_1); @@ -423,20 +421,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[Temp2], %[Temp1](%[cm]) \n\t" "sb %[Temp2], 0(%[dest_pix]) \n\t" - : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), - [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), - [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), - [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), - [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), - [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), - [dest_pix] "+r" (dest_pix) - : [const_2_power_13] "r" (const_2_power_13), - [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), - [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), - [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), - [cospi_24_64] "r" (cospi_24_64), - [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) - ); + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), + [dest_stride] "r"(dest_stride)); input += 8; } @@ -449,11 +445,7 @@ void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); // First transform rows idct8_rows_dspr2(input, outptr, 8); @@ -469,18 +461,14 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, uint32_t pos = 45; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); // First transform rows idct8_rows_dspr2(input, outptr, 4); outptr += 4; - __asm__ __volatile__ ( + __asm__ __volatile__( "sw $zero, 0(%[outptr]) \n\t" "sw $zero, 4(%[outptr]) \n\t" "sw $zero, 16(%[outptr]) \n\t" @@ -499,9 +487,7 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, "sw $zero, 116(%[outptr]) \n\t" : - : [outptr] "r" (outptr) - ); - + : [outptr] "r"(outptr)); 
// Then transform columns and add to dest idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); @@ -516,35 +502,31 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int32_t t1, t2, vector_a1, vector_1, vector_2; /* bit positon for extract from acc */ - __asm__ __volatile__ ( - "wrdsp %[pos], 1 \n\t" + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r" (pos) - ); + : + : [pos] "r"(pos)); out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__ ( + __asm__ __volatile__( "addi %[out], %[out], 16 \n\t" "sra %[a1], %[out], 5 \n\t" - : [out] "+r" (out), [a1] "=r" (a1) - : - ); + : [out] "+r"(out), [a1] "=r"(a1) + :); if (a1 < 0) { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( + __asm__ __volatile__( "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 8; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" @@ -553,24 +535,20 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [dest] "+&r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); } } else { /* use quad-byte * input and output memory are four byte aligned */ - __asm__ __volatile__ ( - "replv.qb %[vector_a1], %[a1] \n\t" + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r" (vector_a1) - : [a1] "r" (a1) - ); + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); for (r = 8; r--;) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[t1], 0(%[dest]) \n\t" "lw %[t2], 4(%[dest]) \n\t" "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" @@ -579,11 +557,9 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "add %[dest], %[dest], %[dest_stride] \n\t" - : [t1] "=&r" (t1), [t2] "=&r" (t2), - [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), - [dest] "+r" (dest) - : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) - ); + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); } } } @@ -602,20 +578,20 @@ void iadst8_dspr2(const int16_t *input, int16_t *output) { x7 = input[6]; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] - = output[5] = output[6] = output[7] = 0; + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = 0; return; } // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; s2 = cospi_10_64 * x2 + cospi_22_64 * x3; s3 = cospi_22_64 * x2 - cospi_10_64 * x3; s4 = cospi_18_64 * x4 + cospi_14_64 * x5; s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s6 = 
cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); @@ -631,10 +607,10 @@ void iadst8_dspr2(const int16_t *input, int16_t *output) { s1 = x1; s2 = x2; s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; x0 = s0 + s2; x1 = s1 + s3; @@ -656,13 +632,13 @@ void iadst8_dspr2(const int16_t *input, int16_t *output) { x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); - output[0] = x0; + output[0] = x0; output[1] = -x4; - output[2] = x6; + output[2] = x6; output[3] = -x2; - output[4] = x3; + output[4] = x3; output[5] = -x7; - output[6] = x5; + output[6] = x5; output[7] = -x1; } #endif // HAVE_DSPR2 diff --git a/vpx_dsp/mips/loopfilter_16_msa.c b/vpx_dsp/mips/loopfilter_16_msa.c index 0ad1dd2f2d2a86446cc8df8d17e0a345767a21d7..4aad863de02a0d6e48b7d54dce4e8ff9495079ea 100644 --- a/vpx_dsp/mips/loopfilter_16_msa.c +++ b/vpx_dsp/mips/loopfilter_16_msa.c @@ -11,8 +11,7 @@ #include "vpx_ports/mem.h" #include "vpx_dsp/mips/loopfilter_msa.h" -int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, - uint8_t *filter48, +int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, const uint8_t *thresh_ptr) { @@ -33,8 +32,8 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, limit = (v16u8)__msa_fill_b(*limit_ptr); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); VPX_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); @@ -43,9 +42,8 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, return 1; } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, - q2_r, q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); @@ -107,8 +105,8 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { } else { src -= 7 * pitch; - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, - zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in); q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); @@ -408,8 +406,7 @@ void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr, int32_t count) { DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); uint8_t early_exit = 0; @@ -426,8 +423,7 @@ void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, 
int32_t pitch, static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, - int32_t count) { + const uint8_t *thresh_ptr, int32_t count) { if (1 == count) { uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; uint64_t dword0, dword1; @@ -449,8 +445,8 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); VPX_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); @@ -472,9 +468,8 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, - zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, - q0_filter8); + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); /* store pixel values */ @@ -668,8 +663,8 @@ static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - LD_UB8(input, in_pitch, - p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org); + LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, + p1_org, p0_org); /* 8x8 transpose */ TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org, p7, p6, p5, p4, p3, p2, p1, p0); @@ -699,8 +694,8 @@ static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); } -static void transpose_16x16(uint8_t *input, int32_t in_pitch, - uint8_t *output, int32_t out_pitch) { +static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, + int32_t out_pitch) { v16u8 row0, row1, row2, row3, row4, row5, row6, row7; v16u8 row8, row9, row10, row11, row12, row13, row14, row15; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -709,12 +704,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch, LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); input += (8 * in_pitch); - LD_UB8(input, in_pitch, - row8, row9, row10, row11, row12, row13, row14, row15); + LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15); - TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, - row8, row9, row10, row11, row12, row13, row14, row15, - p7, p6, p5, p4, p3, p2, p1, p0); + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p7, p6, + p5, p4, p3, p2, p1, p0); /* transpose 16x8 matrix into 8x16 */ /* total 8 intermediate register and 32 instructions */ @@ -779,8 +773,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, limit = (v16u8)__msa_fill_b(*limit_ptr); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); /* flat4 */ 
VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ @@ -794,9 +788,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); return 1; } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, - q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); @@ -864,9 +857,9 @@ int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, } else { src -= 7 * 16; - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, - zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, - p3_r_in, p2_r_in, p1_r_in, p0_r_in); + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); tmp0_r = p7_r_in << 3; @@ -1056,9 +1049,9 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); - early_exit = vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), - &filter48[0], src, pitch, b_limit_ptr, - limit_ptr, thresh_ptr); + early_exit = + vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, @@ -1093,8 +1086,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, limit = (v16u8)__msa_fill_b(*limit_ptr); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); /* flat4 */ VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ @@ -1113,9 +1106,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, return 1; } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, - q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); @@ -1196,9 +1188,9 @@ int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, } else { src -= 7 * 16; - ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, - zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, - p3_r_in, p2_r_in, p1_r_in, p0_r_in); + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); tmp0_r = p7_r_in << 3; @@ -1479,9 +1471,9 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, transpose_16x16((src - 8), pitch, &transposed_input[0], 16); - early_exit = vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), - &filter48[0], src, pitch, b_limit_ptr, - limit_ptr, thresh_ptr); + early_exit = + vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), 
&filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, diff --git a/vpx_dsp/mips/loopfilter_4_msa.c b/vpx_dsp/mips/loopfilter_4_msa.c index 5ea27ae6a743e5fb2fc231dcb748e3a436ae4370..fe216c2b6f938bf39c4cb8b9532c0e7b8ab0079d 100644 --- a/vpx_dsp/mips/loopfilter_4_msa.c +++ b/vpx_dsp/mips/loopfilter_4_msa.c @@ -25,8 +25,8 @@ void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); p1_d = __msa_copy_u_d((v2i64)p1_out, 0); @@ -61,8 +61,8 @@ void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, limit1 = (v16u8)__msa_fill_b(*limit1_ptr); limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); VPX_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); @@ -82,10 +82,10 @@ void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr); - TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, - p3, p2, p1, p0, q0, q1, q2, q3); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); ILVRL_H2_SH(vec1, vec0, vec2, vec3); @@ -111,12 +111,12 @@ void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); - LD_UB8(src - 4 + (8 * pitch), pitch, - row8, row9, row10, row11, row12, row13, row14, row15); + LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, + row14, row15); - TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, - row8, row9, row10, row11, row12, row13, row14, row15, - p3, p2, p1, p0, q0, q1, q2, q3); + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); @@ -130,8 +130,8 @@ void vpx_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, limit1 = (v16u8)__msa_fill_b(*limit1_ptr); limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); VPX_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); diff --git a/vpx_dsp/mips/loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c index 5d5dbd26cb7f5c4a847bdc0d82c93abc5fe274c4..af0d628fa36e7fdfd75ecd1c6d64e98c027bb32e 100644 --- a/vpx_dsp/mips/loopfilter_8_msa.c 
+++ b/vpx_dsp/mips/loopfilter_8_msa.c @@ -29,8 +29,8 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr); - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); VPX_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); @@ -43,16 +43,14 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, q1_d = __msa_copy_u_d((v2i64)q1_out, 0); SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, - q2_r, q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); /* convert 16 bit output data into 8 bit */ - PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, - zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8, - q0_filter8); + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); /* store pixel values */ @@ -80,13 +78,10 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, } } -void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *b_limit1, - const uint8_t *limit1, - const uint8_t *thresh1) { +void vpx_lpf_horizontal_8_dual_msa( + uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; @@ -112,17 +107,16 @@ void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch, limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); VPX_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); if (__msa_test_bz_v(flat)) { ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, - q2_r, q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); @@ -170,16 +164,16 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, /* load vector elements */ LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); - TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, - p3, p2, p1, p0, q0, q1, q2, q3); + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); thresh = 
(v16u8)__msa_fill_b(*thresh_ptr); b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); limit = (v16u8)__msa_fill_b(*limit_ptr); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); /* flat4 */ VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ @@ -197,9 +191,8 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, src += 4 * pitch; ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, - q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); /* convert 16 bit output data into 8 bit */ @@ -232,11 +225,9 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, } void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit0, - const uint8_t *limit0, + const uint8_t *b_limit0, const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *b_limit1, - const uint8_t *limit1, + const uint8_t *b_limit1, const uint8_t *limit1, const uint8_t *thresh1) { uint8_t *temp_src; v16u8 p3, p2, p1, p0, q3, q2, q1, q0; @@ -257,9 +248,9 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); /* transpose 16x8 matrix into 8x16 */ - TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, - q3, q2, q1, q0, row12, row13, row14, row15, - p3, p2, p1, p0, q0, q1, q2, q3); + TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, + row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, + q3); thresh = (v16u8)__msa_fill_b(*thresh0); vec0 = (v8i16)__msa_fill_b(*thresh1); @@ -274,8 +265,8 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit); /* mask and hev */ - LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, - hev, mask, flat); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); /* flat4 */ VPX_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ @@ -292,9 +283,8 @@ void vpx_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, src += 8 * pitch; ST4x8_UB(vec4, vec5, src, pitch); } else { - ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, - zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, - q3_r); + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); VPX_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c index 8414b9ed53f840891913f388a66affe759dba73e..f1743679a7d690063f9daa23e40959c4bff009f6 100644 --- a/vpx_dsp/mips/loopfilter_filters_dspr2.c +++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c @@ -19,33 +19,30 @@ #include "vpx_mem/vpx_mem.h" #if HAVE_DSPR2 -void vpx_lpf_horizontal_4_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, +void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, 
const uint8_t *thresh) { - uint8_t i; - uint32_t mask; - uint32_t hev; - uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; - uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; + uint8_t i; + uint32_t mask; + uint32_t hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; uflimit = *blimit; ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); /* prefetch data for store */ prefetch_store(s); @@ -62,49 +59,44 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s, s5 = s4 + pitch; s6 = s5 + pitch; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[p1], (%[s1]) \n\t" "lw %[p2], (%[s2]) \n\t" "lw %[p3], (%[s3]) \n\t" "lw %[p4], (%[s4]) \n\t" - : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4) - : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) - ); + : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); /* if (p1 - p4 == 0) and (p2 - p3 == 0) mask will be zero and filtering is not needed */ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[pm1], (%[sm1]) \n\t" "lw %[p0], (%[s0]) \n\t" "lw %[p5], (%[s5]) \n\t" "lw %[p6], (%[s6]) \n\t" - : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5), - [p6] "=&r" (p6) - : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6) - ); + : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6) + : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6)); - filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, - pm1, p0, p3, p4, p5, p6, - thresh_vec, &hev, &mask); + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); /* if mask == 0 do filtering is not needed */ if (mask) { /* filtering */ filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p1], (%[s1]) \n\t" "sw %[p2], (%[s2]) \n\t" "sw %[p3], (%[s3]) \n\t" "sw %[p4], (%[s4]) \n\t" : - : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4), - [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) - ); + : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4), + [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); } } @@ -112,33 +104,30 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s, } } -void vpx_lpf_vertical_4_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, +void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev; - uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; + uint8_t i; + uint32_t mask, hev; + 
uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; uflimit = *blimit; ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); /* prefetch data for store */ prefetch_store(s + pitch); @@ -148,22 +137,22 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s, s2 = s + pitch; s3 = s2 + pitch; s4 = s3 + pitch; - s = s4 + pitch; + s = s4 + pitch; /* load quad-byte vectors * memory is 4 byte aligned */ - p2 = *((uint32_t *)(s1 - 4)); - p6 = *((uint32_t *)(s1)); - p1 = *((uint32_t *)(s2 - 4)); - p5 = *((uint32_t *)(s2)); - p0 = *((uint32_t *)(s3 - 4)); - p4 = *((uint32_t *)(s3)); + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); pm1 = *((uint32_t *)(s4 - 4)); - p3 = *((uint32_t *)(s4)); + p3 = *((uint32_t *)(s4)); /* transpose pm1, p0, p1, p2 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" @@ -179,15 +168,13 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s, "append %[p1], %[sec3], 16 \n\t" "append %[pm1], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* transpose p3, p4, p5, p6 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" @@ -203,20 +190,17 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s, "append %[p5], %[sec3], 16 \n\t" "append %[p3], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* if (p1 - p4 == 0) and (p2 - p3 == 0) * mask will be zero and filtering is not needed */ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { - filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, - p0, p3, p4, p5, p6, thresh_vec, - &hev, &mask); + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); /* if mask == 0 do filtering is not needed */ if (mask) { @@ -227,107 +211,93 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s, * don't use 
transpose on output data * because memory isn't aligned */ - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p4], 1(%[s4]) \n\t" "sb %[p3], 0(%[s4]) \n\t" "sb %[p2], -1(%[s4]) \n\t" "sb %[p1], -2(%[s4]) \n\t" : - : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), - [s4] "r" (s4) - ); + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s4] "r"(s4)); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p4], %[p4], 8 \n\t" "srl %[p3], %[p3], 8 \n\t" "srl %[p2], %[p2], 8 \n\t" "srl %[p1], %[p1], 8 \n\t" - : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) - : - ); + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p4], 1(%[s3]) \n\t" "sb %[p3], 0(%[s3]) \n\t" "sb %[p2], -1(%[s3]) \n\t" "sb %[p1], -2(%[s3]) \n\t" - : [p1] "+r" (p1) - : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3) - ); + : [p1] "+r"(p1) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3)); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p4], %[p4], 8 \n\t" "srl %[p3], %[p3], 8 \n\t" "srl %[p2], %[p2], 8 \n\t" "srl %[p1], %[p1], 8 \n\t" - : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) - : - ); + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p4], 1(%[s2]) \n\t" "sb %[p3], 0(%[s2]) \n\t" "sb %[p2], -1(%[s2]) \n\t" "sb %[p1], -2(%[s2]) \n\t" : - : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), - [s2] "r" (s2) - ); + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s2] "r"(s2)); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p4], %[p4], 8 \n\t" "srl %[p3], %[p3], 8 \n\t" "srl %[p2], %[p2], 8 \n\t" "srl %[p1], %[p1], 8 \n\t" - : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) - : - ); + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p4], 1(%[s1]) \n\t" "sb %[p3], 0(%[s1]) \n\t" "sb %[p2], -1(%[s1]) \n\t" "sb %[p1], -2(%[s1]) \n\t" : - : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), - [s1] "r" (s1) - ); + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s1] "r"(s1)); } } } } -void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { +void vpx_lpf_horizontal_4_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); } -void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { +void vpx_lpf_horizontal_8_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); } -void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, - const uint8_t *blimit0, +void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, 
const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, @@ -337,8 +307,7 @@ void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); } -void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, - const uint8_t *blimit0, +void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, @@ -348,8 +317,7 @@ void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); } -void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, - const uint8_t *blimit, +void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.h b/vpx_dsp/mips/loopfilter_filters_dspr2.h index db39854368cdcaa595df1ecd080d8db657917db1..11f286d281beb6f83290abc48f1c09ad7c6d980a 100644 --- a/vpx_dsp/mips/loopfilter_filters_dspr2.h +++ b/vpx_dsp/mips/loopfilter_filters_dspr2.h @@ -24,22 +24,21 @@ extern "C" { #if HAVE_DSPR2 /* inputs & outputs are quad-byte vectors */ -static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, - uint32_t *ps1, uint32_t *ps0, - uint32_t *qs0, uint32_t *qs1) { - int32_t vpx_filter_l, vpx_filter_r; - int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; - int32_t subr_r, subr_l; - uint32_t t1, t2, HWM, t3; - uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; - int32_t vps1, vps0, vqs0, vqs1; - int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; - uint32_t N128; +static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1, + uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) { + int32_t vpx_filter_l, vpx_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; N128 = 0x80808080; - t1 = 0x03000300; - t2 = 0x04000400; - t3 = 0x01000100; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; HWM = 0xFF00FF00; vps0 = (*ps0) ^ N128; @@ -72,7 +71,7 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, hev_r = hev << 8; hev_r = hev_r & HWM; - __asm__ __volatile__ ( + __asm__ __volatile__( /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */ "subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t" "subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t" @@ -99,20 +98,17 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, "and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t" "and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t" - : [vpx_filter_l] "=&r" (vpx_filter_l), - [vpx_filter_r] "=&r" (vpx_filter_r), - [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r), - [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r) - : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), - [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), - [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r), - [mask_l] "r" (mask_l), [mask_r] "r" (mask_r), - [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), - [HWM] "r" (HWM) - ); + : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] 
"r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); /* save bottom 3 bits so that we round one side +4 and the other +3 */ - __asm__ __volatile__ ( + __asm__ __volatile__( /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */ "addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t" "addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t" @@ -137,15 +133,14 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" - : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), - [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r), - [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), - [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) - : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM), - [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r) - ); + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r)); - __asm__ __volatile__ ( + __asm__ __volatile__( /* (vpx_filter += 1) >>= 1 */ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" @@ -162,11 +157,10 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" - : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r), - [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), - [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) - : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) - ); + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); /* Create quad-bytes from halfword pairs */ vqs0_l = vqs0_l & HWM; @@ -174,16 +168,15 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, vps0_l = vps0_l & HWM; vps1_l = vps1_l & HWM; - __asm__ __volatile__ ( + __asm__ __volatile__( "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" - : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r), - [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r) - : - ); + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); vqs0 = vqs0_l | vqs0_r; vqs1 = vqs1_l | vqs1_r; @@ -196,24 +189,23 @@ static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, *qs1 = vqs1 ^ N128; } -static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, - uint32_t ps1, uint32_t ps0, - uint32_t qs0, uint32_t qs1, +static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1, + uint32_t ps0, uint32_t qs0, uint32_t qs1, uint32_t *p1_f0, uint32_t *p0_f0, uint32_t *q0_f0, uint32_t *q1_f0) { - int32_t vpx_filter_l, vpx_filter_r; - int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; - int32_t subr_r, subr_l; - uint32_t t1, t2, HWM, t3; - uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; - 
int32_t vps1, vps0, vqs0, vqs1; - int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; - uint32_t N128; + int32_t vpx_filter_l, vpx_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; N128 = 0x80808080; - t1 = 0x03000300; - t2 = 0x04000400; - t3 = 0x01000100; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; HWM = 0xFF00FF00; vps0 = (ps0) ^ N128; @@ -246,7 +238,7 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, hev_r = hev << 8; hev_r = hev_r & HWM; - __asm__ __volatile__ ( + __asm__ __volatile__( /* vpx_filter = vp8_signed_char_clamp(ps1 - qs1); */ "subq_s.ph %[vpx_filter_l], %[vps1_l], %[vqs1_l] \n\t" "subq_s.ph %[vpx_filter_r], %[vps1_r], %[vqs1_r] \n\t" @@ -273,19 +265,17 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, "and %[vpx_filter_l], %[vpx_filter_l], %[mask_l] \n\t" "and %[vpx_filter_r], %[vpx_filter_r], %[mask_r] \n\t" - : [vpx_filter_l] "=&r" (vpx_filter_l), - [vpx_filter_r] "=&r" (vpx_filter_r), - [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r), - [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r) - : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), - [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), - [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r), - [mask_l] "r" (mask_l), [mask_r] "r" (mask_r), - [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM) - ); + : [vpx_filter_l] "=&r"(vpx_filter_l), [vpx_filter_r] "=&r"(vpx_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); /* save bottom 3 bits so that we round one side +4 and the other +3 */ - __asm__ __volatile__ ( + __asm__ __volatile__( /* Filter2 = vp8_signed_char_clamp(vpx_filter + 3) >>= 3; */ "addq_s.ph %[Filter1_l], %[vpx_filter_l], %[t2] \n\t" "addq_s.ph %[Filter1_r], %[vpx_filter_r], %[t2] \n\t" @@ -310,15 +300,14 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" - : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), - [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r), - [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), - [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) - : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM), - [vpx_filter_l] "r" (vpx_filter_l), [vpx_filter_r] "r" (vpx_filter_r) - ); + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [vpx_filter_l] "r"(vpx_filter_l), [vpx_filter_r] "r"(vpx_filter_r)); - __asm__ __volatile__ ( + __asm__ __volatile__( /* (vpx_filter += 1) >>= 1 */ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" @@ -335,11 +324,10 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t 
hev, "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" - : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r), - [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), - [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) - : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) - ); + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); /* Create quad-bytes from halfword pairs */ vqs0_l = vqs0_l & HWM; @@ -347,16 +335,15 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, vps0_l = vps0_l & HWM; vps1_l = vps1_l & HWM; - __asm__ __volatile__ ( + __asm__ __volatile__( "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" - : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r), - [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r) - : - ); + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); vqs0 = vqs0_l | vqs0_r; vqs1 = vqs1_l | vqs1_r; @@ -369,18 +356,17 @@ static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, *q1_f0 = vqs1 ^ N128; } -static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, - uint32_t *op1, uint32_t *op0, - uint32_t *oq0, uint32_t *oq1, +static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1, + uint32_t *op0, uint32_t *oq0, uint32_t *oq1, uint32_t *oq2, uint32_t *oq3) { /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - uint32_t res_op2, res_op1, res_op0; - uint32_t res_oq0, res_oq1, res_oq2; - uint32_t tmp; - uint32_t add_p210_q012; - uint32_t u32Four = 0x00040004; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ @@ -389,7 +375,7 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" @@ -428,15 +414,12 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, "shrl.ph %[res_op0], %[res_op0], 3 \n\t" "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" - : [add_p210_q012] "=&r" (add_p210_q012), - [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2), - [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0), - [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1), - [res_oq2] "=&r" (res_oq2) - : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1), - [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3), - [u32Four] "r" (u32Four) - ); + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), 
[p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); *op2 = res_op2; *op1 = res_op1; @@ -446,20 +429,18 @@ static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, *oq2 = res_oq2; } -static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, - uint32_t p1, uint32_t p0, - uint32_t q0, uint32_t q1, - uint32_t q2, uint32_t q3, - uint32_t *op2_f1, +static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, uint32_t *op2_f1, uint32_t *op1_f1, uint32_t *op0_f1, uint32_t *oq0_f1, uint32_t *oq1_f1, uint32_t *oq2_f1) { /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ - uint32_t res_op2, res_op1, res_op0; - uint32_t res_oq0, res_oq1, res_oq2; - uint32_t tmp; - uint32_t add_p210_q012; - uint32_t u32Four = 0x00040004; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ @@ -468,7 +449,7 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" @@ -507,14 +488,12 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, "shrl.ph %[res_op0], %[res_op0], 3 \n\t" "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" - : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp), - [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1), - [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0), - [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2) - : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1), - [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3), - [u32Four] "r" (u32Four) - ); + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); *op2_f1 = res_op2; *op1_f1 = res_op1; @@ -524,25 +503,22 @@ static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, *oq2_f1 = res_oq2; } -static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, - uint32_t *op5, uint32_t *op4, - uint32_t *op3, uint32_t *op2, - uint32_t *op1, uint32_t *op0, - uint32_t *oq0, uint32_t *oq1, - uint32_t *oq2, uint32_t *oq3, - uint32_t *oq4, uint32_t *oq5, - uint32_t *oq6, uint32_t *oq7) { +static INLINE void wide_mbfilter_dspr2( + uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3, + uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6, + uint32_t *oq7) { const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; - uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; - 
uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; - uint32_t tmp; - uint32_t add_p6toq6; - uint32_t u32Eight = 0x00080008; + uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; + uint32_t tmp; + uint32_t add_p6toq6; + uint32_t u32Eight = 0x00080008; - __asm__ __volatile__ ( + __asm__ __volatile__( /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 which is used most of the time */ "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" @@ -560,15 +536,13 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" - : [add_p6toq6] "=&r" (add_p6toq6) - : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), - [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), - [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3), - [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), - [u32Eight] "r" (u32Eight) - ); + : [add_p6toq6] "=&r"(add_p6toq6) + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), + [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [u32Eight] "r"(u32Eight)); - __asm__ __volatile__ ( + __asm__ __volatile__( /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4) */ "shll.ph %[tmp], %[p7], 3 \n\t" @@ -643,16 +617,14 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" "shrl.ph %[res_op0], %[res_op0], 4 \n\t" - : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5), - [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3), - [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1), - [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp) - : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), - [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), - [q2] "r" (q2), [q1] "r" (q1), - [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), - [add_p6toq6] "r" (add_p6toq6) - ); + : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5), + [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp) + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [add_p6toq6] "r"(add_p6toq6)); *op6 = res_op6; *op5 = res_op5; @@ -662,7 +634,7 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, *op1 = res_op1; *op0 = res_op0; - __asm__ __volatile__ ( + __asm__ __volatile__( /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ "addu.ph %[res_oq0], %[q7], %[q0] \n\t" @@ -737,16 +709,14 @@ static INLINE void wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" - : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5), - [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3), - [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1), - [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp) - : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), - [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0), - [p1] "r" (p1), [p2] "r" (p2), - [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6), - 
[add_p6toq6] "r" (add_p6toq6) - ); + : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5), + [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3), + [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1), + [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp) + : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2), + [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), + [add_p6toq6] "r"(add_p6toq6)); *oq0 = res_oq0; *oq1 = res_oq1; diff --git a/vpx_dsp/mips/loopfilter_macros_dspr2.h b/vpx_dsp/mips/loopfilter_macros_dspr2.h index a990b4061bab30490803a13b6a7f4d0aff37fa5c..769371dff8aa2e4f8ee23af309a523a69bb7d2c1 100644 --- a/vpx_dsp/mips/loopfilter_macros_dspr2.h +++ b/vpx_dsp/mips/loopfilter_macros_dspr2.h @@ -22,453 +22,410 @@ extern "C" { #endif #if HAVE_DSPR2 -#define STORE_F0() { \ - __asm__ __volatile__ ( \ - "sb %[q1_f0], 1(%[s4]) \n\t" \ - "sb %[q0_f0], 0(%[s4]) \n\t" \ - "sb %[p0_f0], -1(%[s4]) \n\t" \ - "sb %[p1_f0], -2(%[s4]) \n\t" \ - \ - : \ - : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ - [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ - [s4] "r" (s4) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ - [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q1_f0], 1(%[s3]) \n\t" \ - "sb %[q0_f0], 0(%[s3]) \n\t" \ - "sb %[p0_f0], -1(%[s3]) \n\t" \ - "sb %[p1_f0], -2(%[s3]) \n\t" \ - \ - : [p1_f0] "+r" (p1_f0) \ - : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ - [s3] "r" (s3), [p0_f0] "r" (p0_f0) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ - [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q1_f0], 1(%[s2]) \n\t" \ - "sb %[q0_f0], 0(%[s2]) \n\t" \ - "sb %[p0_f0], -1(%[s2]) \n\t" \ - "sb %[p1_f0], -2(%[s2]) \n\t" \ - \ - : \ - : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ - [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ - [s2] "r" (s2) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q1_f0], %[q1_f0], 8 \n\t" \ - "srl %[q0_f0], %[q0_f0], 8 \n\t" \ - "srl %[p0_f0], %[p0_f0], 8 \n\t" \ - "srl %[p1_f0], %[p1_f0], 8 \n\t" \ - \ - : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ - [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q1_f0], 1(%[s1]) \n\t" \ - "sb %[q0_f0], 0(%[s1]) \n\t" \ - "sb %[p0_f0], -1(%[s1]) \n\t" \ - "sb %[p1_f0], -2(%[s1]) \n\t" \ - \ - : \ - : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ - [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ - [s1] "r" (s1) \ - ); \ -} +#define STORE_F0() \ + { \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s4]) \n\t" \ + "sb %[q0_f0], 0(%[s4]) \n\t" \ + "sb %[p0_f0], -1(%[s4]) \n\t" \ + "sb %[p1_f0], -2(%[s4]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 
1(%[s3]) \n\t" \ + "sb %[q0_f0], 0(%[s3]) \n\t" \ + "sb %[p0_f0], -1(%[s3]) \n\t" \ + "sb %[p1_f0], -2(%[s3]) \n\t" \ + \ + : [p1_f0] "+r"(p1_f0) \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \ + [p0_f0] "r"(p0_f0)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s2]) \n\t" \ + "sb %[q0_f0], 0(%[s2]) \n\t" \ + "sb %[p0_f0], -1(%[s2]) \n\t" \ + "sb %[p1_f0], -2(%[s2]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s1]) \n\t" \ + "sb %[q0_f0], 0(%[s1]) \n\t" \ + "sb %[p0_f0], -1(%[s1]) \n\t" \ + "sb %[p1_f0], -2(%[s1]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \ + } -#define STORE_F1() { \ - __asm__ __volatile__ ( \ - "sb %[q2_r], 2(%[s4]) \n\t" \ - "sb %[q1_r], 1(%[s4]) \n\t" \ - "sb %[q0_r], 0(%[s4]) \n\t" \ - "sb %[p0_r], -1(%[s4]) \n\t" \ - "sb %[p1_r], -2(%[s4]) \n\t" \ - "sb %[p2_r], -3(%[s4]) \n\t" \ - \ - : \ - : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \ - [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ - [s4] "r" (s4) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q2_r], %[q2_r], 16 \n\t" \ - "srl %[q1_r], %[q1_r], 16 \n\t" \ - "srl %[q0_r], %[q0_r], 16 \n\t" \ - "srl %[p0_r], %[p0_r], 16 \n\t" \ - "srl %[p1_r], %[p1_r], 16 \n\t" \ - "srl %[p2_r], %[p2_r], 16 \n\t" \ - \ - : [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), [q0_r] "+r" (q0_r), \ - [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q2_r], 2(%[s3]) \n\t" \ - "sb %[q1_r], 1(%[s3]) \n\t" \ - "sb %[q0_r], 0(%[s3]) \n\t" \ - "sb %[p0_r], -1(%[s3]) \n\t" \ - "sb %[p1_r], -2(%[s3]) \n\t" \ - "sb %[p2_r], -3(%[s3]) \n\t" \ - \ - : \ - : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \ - [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ - [s3] "r" (s3) \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q2_l], 2(%[s2]) \n\t" \ - "sb %[q1_l], 1(%[s2]) \n\t" \ - "sb %[q0_l], 0(%[s2]) \n\t" \ - "sb %[p0_l], -1(%[s2]) \n\t" \ - "sb %[p1_l], -2(%[s2]) \n\t" \ - "sb %[p2_l], -3(%[s2]) \n\t" \ - \ - : \ - : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \ - [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ - [s2] "r" (s2) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q2_l], %[q2_l], 16 \n\t" \ - "srl %[q1_l], %[q1_l], 16 \n\t" \ - "srl %[q0_l], %[q0_l], 16 \n\t" \ - "srl %[p0_l], %[p0_l], 16 \n\t" \ - "srl %[p1_l], %[p1_l], 16 \n\t" \ - "srl %[p2_l], %[p2_l], 16 \n\t" \ - \ - : [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), [q0_l] "+r" (q0_l), \ - [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q2_l], 2(%[s1]) \n\t" \ - "sb %[q1_l], 1(%[s1]) \n\t" \ - "sb %[q0_l], 0(%[s1]) \n\t" \ - "sb %[p0_l], -1(%[s1]) \n\t" \ - "sb %[p1_l], -2(%[s1]) \n\t" \ - "sb %[p2_l], -3(%[s1]) \n\t" \ - \ - 
: \ - : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \ - [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ - [s1] "r" (s1) \ - ); \ -} +#define STORE_F1() \ + { \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + \ + : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \ + [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + \ + : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \ + [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \ + } -#define STORE_F2() { \ - __asm__ __volatile__ ( \ - "sb %[q6_r], 6(%[s4]) \n\t" \ - "sb %[q5_r], 5(%[s4]) \n\t" \ - "sb %[q4_r], 4(%[s4]) \n\t" \ - "sb %[q3_r], 3(%[s4]) \n\t" \ - "sb %[q2_r], 2(%[s4]) \n\t" \ - "sb %[q1_r], 1(%[s4]) \n\t" \ - "sb %[q0_r], 0(%[s4]) \n\t" \ - "sb %[p0_r], -1(%[s4]) \n\t" \ - "sb %[p1_r], -2(%[s4]) \n\t" \ - "sb %[p2_r], -3(%[s4]) \n\t" \ - "sb %[p3_r], -4(%[s4]) \n\t" \ - "sb %[p4_r], -5(%[s4]) \n\t" \ - "sb %[p5_r], -6(%[s4]) \n\t" \ - "sb %[p6_r], -7(%[s4]) \n\t" \ - \ - : \ - : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \ - [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \ - [q0_r] "r" (q0_r), \ - [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ - [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \ - [p6_r] "r" (p6_r), \ - [s4] "r" (s4) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q6_r], %[q6_r], 16 \n\t" \ - "srl %[q5_r], %[q5_r], 16 \n\t" \ - "srl %[q4_r], %[q4_r], 16 \n\t" \ - "srl %[q3_r], %[q3_r], 16 \n\t" \ - "srl %[q2_r], %[q2_r], 16 \n\t" \ - "srl %[q1_r], %[q1_r], 16 \n\t" \ - "srl %[q0_r], %[q0_r], 16 \n\t" \ - "srl %[p0_r], 
%[p0_r], 16 \n\t" \ - "srl %[p1_r], %[p1_r], 16 \n\t" \ - "srl %[p2_r], %[p2_r], 16 \n\t" \ - "srl %[p3_r], %[p3_r], 16 \n\t" \ - "srl %[p4_r], %[p4_r], 16 \n\t" \ - "srl %[p5_r], %[p5_r], 16 \n\t" \ - "srl %[p6_r], %[p6_r], 16 \n\t" \ - \ - : [q6_r] "+r" (q6_r), [q5_r] "+r" (q5_r), [q4_r] "+r" (q4_r), \ - [q3_r] "+r" (q3_r), [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), \ - [q0_r] "+r" (q0_r), \ - [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r), \ - [p3_r] "+r" (p3_r), [p4_r] "+r" (p4_r), [p5_r] "+r" (p5_r), \ - [p6_r] "+r" (p6_r) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q6_r], 6(%[s3]) \n\t" \ - "sb %[q5_r], 5(%[s3]) \n\t" \ - "sb %[q4_r], 4(%[s3]) \n\t" \ - "sb %[q3_r], 3(%[s3]) \n\t" \ - "sb %[q2_r], 2(%[s3]) \n\t" \ - "sb %[q1_r], 1(%[s3]) \n\t" \ - "sb %[q0_r], 0(%[s3]) \n\t" \ - "sb %[p0_r], -1(%[s3]) \n\t" \ - "sb %[p1_r], -2(%[s3]) \n\t" \ - "sb %[p2_r], -3(%[s3]) \n\t" \ - "sb %[p3_r], -4(%[s3]) \n\t" \ - "sb %[p4_r], -5(%[s3]) \n\t" \ - "sb %[p5_r], -6(%[s3]) \n\t" \ - "sb %[p6_r], -7(%[s3]) \n\t" \ - \ - : \ - : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \ - [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \ - [q0_r] "r" (q0_r), \ - [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ - [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \ - [p6_r] "r" (p6_r), \ - [s3] "r" (s3) \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q6_l], 6(%[s2]) \n\t" \ - "sb %[q5_l], 5(%[s2]) \n\t" \ - "sb %[q4_l], 4(%[s2]) \n\t" \ - "sb %[q3_l], 3(%[s2]) \n\t" \ - "sb %[q2_l], 2(%[s2]) \n\t" \ - "sb %[q1_l], 1(%[s2]) \n\t" \ - "sb %[q0_l], 0(%[s2]) \n\t" \ - "sb %[p0_l], -1(%[s2]) \n\t" \ - "sb %[p1_l], -2(%[s2]) \n\t" \ - "sb %[p2_l], -3(%[s2]) \n\t" \ - "sb %[p3_l], -4(%[s2]) \n\t" \ - "sb %[p4_l], -5(%[s2]) \n\t" \ - "sb %[p5_l], -6(%[s2]) \n\t" \ - "sb %[p6_l], -7(%[s2]) \n\t" \ - \ - : \ - : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \ - [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \ - [q0_l] "r" (q0_l), \ - [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ - [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \ - [p6_l] "r" (p6_l), \ - [s2] "r" (s2) \ - ); \ - \ - __asm__ __volatile__ ( \ - "srl %[q6_l], %[q6_l], 16 \n\t" \ - "srl %[q5_l], %[q5_l], 16 \n\t" \ - "srl %[q4_l], %[q4_l], 16 \n\t" \ - "srl %[q3_l], %[q3_l], 16 \n\t" \ - "srl %[q2_l], %[q2_l], 16 \n\t" \ - "srl %[q1_l], %[q1_l], 16 \n\t" \ - "srl %[q0_l], %[q0_l], 16 \n\t" \ - "srl %[p0_l], %[p0_l], 16 \n\t" \ - "srl %[p1_l], %[p1_l], 16 \n\t" \ - "srl %[p2_l], %[p2_l], 16 \n\t" \ - "srl %[p3_l], %[p3_l], 16 \n\t" \ - "srl %[p4_l], %[p4_l], 16 \n\t" \ - "srl %[p5_l], %[p5_l], 16 \n\t" \ - "srl %[p6_l], %[p6_l], 16 \n\t" \ - \ - : [q6_l] "+r" (q6_l), [q5_l] "+r" (q5_l), [q4_l] "+r" (q4_l), \ - [q3_l] "+r" (q3_l), [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), \ - [q0_l] "+r" (q0_l), \ - [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l), \ - [p3_l] "+r" (p3_l), [p4_l] "+r" (p4_l), [p5_l] "+r" (p5_l), \ - [p6_l] "+r" (p6_l) \ - : \ - ); \ - \ - __asm__ __volatile__ ( \ - "sb %[q6_l], 6(%[s1]) \n\t" \ - "sb %[q5_l], 5(%[s1]) \n\t" \ - "sb %[q4_l], 4(%[s1]) \n\t" \ - "sb %[q3_l], 3(%[s1]) \n\t" \ - "sb %[q2_l], 2(%[s1]) \n\t" \ - "sb %[q1_l], 1(%[s1]) \n\t" \ - "sb %[q0_l], 0(%[s1]) \n\t" \ - "sb %[p0_l], -1(%[s1]) \n\t" \ - "sb %[p1_l], -2(%[s1]) \n\t" \ - "sb %[p2_l], -3(%[s1]) \n\t" \ - "sb %[p3_l], -4(%[s1]) \n\t" \ - "sb %[p4_l], -5(%[s1]) \n\t" \ - "sb %[p5_l], -6(%[s1]) \n\t" \ - "sb %[p6_l], -7(%[s1]) \n\t" \ - \ - : \ - : [q6_l] "r" 
(q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \ - [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \ - [q0_l] "r" (q0_l), \ - [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ - [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \ - [p6_l] "r" (p6_l), \ - [s1] "r" (s1) \ - ); \ -} +#define STORE_F2() \ + { \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s4]) \n\t" \ + "sb %[q5_r], 5(%[s4]) \n\t" \ + "sb %[q4_r], 4(%[s4]) \n\t" \ + "sb %[q3_r], 3(%[s4]) \n\t" \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + "sb %[p3_r], -4(%[s4]) \n\t" \ + "sb %[p4_r], -5(%[s4]) \n\t" \ + "sb %[p5_r], -6(%[s4]) \n\t" \ + "sb %[p6_r], -7(%[s4]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_r], %[q6_r], 16 \n\t" \ + "srl %[q5_r], %[q5_r], 16 \n\t" \ + "srl %[q4_r], %[q4_r], 16 \n\t" \ + "srl %[q3_r], %[q3_r], 16 \n\t" \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + "srl %[p3_r], %[p3_r], 16 \n\t" \ + "srl %[p4_r], %[p4_r], 16 \n\t" \ + "srl %[p5_r], %[p5_r], 16 \n\t" \ + "srl %[p6_r], %[p6_r], 16 \n\t" \ + \ + : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \ + [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \ + [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \ + [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \ + [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s3]) \n\t" \ + "sb %[q5_r], 5(%[s3]) \n\t" \ + "sb %[q4_r], 4(%[s3]) \n\t" \ + "sb %[q3_r], 3(%[s3]) \n\t" \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + "sb %[p3_r], -4(%[s3]) \n\t" \ + "sb %[p4_r], -5(%[s3]) \n\t" \ + "sb %[p5_r], -6(%[s3]) \n\t" \ + "sb %[p6_r], -7(%[s3]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s2]) \n\t" \ + "sb %[q5_l], 5(%[s2]) \n\t" \ + "sb %[q4_l], 4(%[s2]) \n\t" \ + "sb %[q3_l], 3(%[s2]) \n\t" \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + "sb %[p3_l], -4(%[s2]) \n\t" \ + "sb %[p4_l], -5(%[s2]) \n\t" \ + "sb %[p5_l], -6(%[s2]) \n\t" \ + "sb %[p6_l], -7(%[s2]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl 
%[q6_l], %[q6_l], 16 \n\t" \ + "srl %[q5_l], %[q5_l], 16 \n\t" \ + "srl %[q4_l], %[q4_l], 16 \n\t" \ + "srl %[q3_l], %[q3_l], 16 \n\t" \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + "srl %[p3_l], %[p3_l], 16 \n\t" \ + "srl %[p4_l], %[p4_l], 16 \n\t" \ + "srl %[p5_l], %[p5_l], 16 \n\t" \ + "srl %[p6_l], %[p6_l], 16 \n\t" \ + \ + : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \ + [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \ + [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \ + [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \ + [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s1]) \n\t" \ + "sb %[q5_l], 5(%[s1]) \n\t" \ + "sb %[q4_l], 4(%[s1]) \n\t" \ + "sb %[q3_l], 3(%[s1]) \n\t" \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + "sb %[p3_l], -4(%[s1]) \n\t" \ + "sb %[p4_l], -5(%[s1]) \n\t" \ + "sb %[p5_l], -6(%[s1]) \n\t" \ + "sb %[p6_l], -7(%[s1]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \ + } -#define PACK_LEFT_0TO3() { \ - __asm__ __volatile__ ( \ - "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ - "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ - "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ - "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ - "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ - "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ - "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ - "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ - \ - : [p3_l] "=&r" (p3_l), [p2_l] "=&r" (p2_l), \ - [p1_l] "=&r" (p1_l), [p0_l] "=&r" (p0_l), \ - [q0_l] "=&r" (q0_l), [q1_l] "=&r" (q1_l), \ - [q2_l] "=&r" (q2_l), [q3_l] "=&r" (q3_l) \ - : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \ - [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \ - ); \ -} +#define PACK_LEFT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ + "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ + "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ + "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ + "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ + "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ + "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ + "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ + \ + : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \ + [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \ + [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } -#define PACK_LEFT_4TO7() { \ - __asm__ __volatile__ ( \ - "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ - "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ - "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ - "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ - "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ - "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ - "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ - "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ - \ - : [p7_l] "=&r" (p7_l), [p6_l] "=&r" (p6_l), \ - [p5_l] "=&r" (p5_l), [p4_l] "=&r" (p4_l), \ - [q4_l] "=&r" (q4_l), [q5_l] "=&r" (q5_l), \ - [q6_l] "=&r" (q6_l), [q7_l] "=&r" (q7_l) \ - : [p7] "r" 
(p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \ - [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \ - ); \ -} +#define PACK_LEFT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ + "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ + "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ + "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ + "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ + "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ + "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ + "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ + \ + : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \ + [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \ + [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } -#define PACK_RIGHT_0TO3() { \ - __asm__ __volatile__ ( \ - "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ - "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ - "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ - "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ - "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ - "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ - "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ - "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ - \ - : [p3_r] "=&r" (p3_r), [p2_r] "=&r" (p2_r), \ - [p1_r] "=&r" (p1_r), [p0_r] "=&r" (p0_r), \ - [q0_r] "=&r" (q0_r), [q1_r] "=&r" (q1_r), \ - [q2_r] "=&r" (q2_r), [q3_r] "=&r" (q3_r) \ - : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \ - [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \ - ); \ -} +#define PACK_RIGHT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ + "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ + "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ + "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ + "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ + "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ + "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ + "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ + \ + : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \ + [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \ + [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } -#define PACK_RIGHT_4TO7() { \ - __asm__ __volatile__ ( \ - "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ - "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ - "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ - "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ - "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ - "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ - "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ - "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ - \ - : [p7_r] "=&r" (p7_r), [p6_r] "=&r" (p6_r), \ - [p5_r] "=&r" (p5_r), [p4_r] "=&r" (p4_r), \ - [q4_r] "=&r" (q4_r), [q5_r] "=&r" (q5_r), \ - [q6_r] "=&r" (q6_r), [q7_r] "=&r" (q7_r) \ - : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \ - [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \ - ); \ -} +#define PACK_RIGHT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ + "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ + "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ + "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ + "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ + "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ + "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ + "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ + \ + : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \ + [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \ + [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] 
"r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } -#define COMBINE_LEFT_RIGHT_0TO2() { \ - __asm__ __volatile__ ( \ - "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ - "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ - "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ - "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ - "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ - "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ - \ - : [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), \ - [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2) \ - : [p2_l] "r" (p2_l), [p2_r] "r" (p2_r), \ - [p1_l] "r" (p1_l), [p1_r] "r" (p1_r), \ - [p0_l] "r" (p0_l), [p0_r] "r" (p0_r), \ - [q0_l] "r" (q0_l), [q0_r] "r" (q0_r), \ - [q1_l] "r" (q1_l), [q1_r] "r" (q1_r), \ - [q2_l] "r" (q2_l), [q2_r] "r" (q2_r) \ - ); \ -} +#define COMBINE_LEFT_RIGHT_0TO2() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ + "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ + "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ + "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ + "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ + "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ + \ + : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \ + [q1] "=&r"(q1), [q2] "=&r"(q2) \ + : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \ + [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \ + [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \ + [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \ + } -#define COMBINE_LEFT_RIGHT_3TO6() { \ - __asm__ __volatile__ ( \ - "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ - "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ - "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ - "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ - "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ - "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ - "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ - "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ - \ - : [p6] "=&r" (p6),[p5] "=&r" (p5), \ - [p4] "=&r" (p4),[p3] "=&r" (p3), \ - [q3] "=&r" (q3),[q4] "=&r" (q4), \ - [q5] "=&r" (q5),[q6] "=&r" (q6) \ - : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), \ - [p4_l] "r" (p4_l), [p3_l] "r" (p3_l), \ - [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), \ - [p4_r] "r" (p4_r), [p3_r] "r" (p3_r), \ - [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), \ - [q5_l] "r" (q5_l), [q6_l] "r" (q6_l), \ - [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), \ - [q5_r] "r" (q5_r), [q6_r] "r" (q6_r) \ - ); \ -} +#define COMBINE_LEFT_RIGHT_3TO6() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ + "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ + "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ + "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ + "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ + "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ + "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ + "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ + \ + : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \ + [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \ + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \ + [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \ + [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \ + [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \ + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \ + [q6_r] "r"(q6_r)); \ + } #endif // #if HAVE_DSPR2 #ifdef __cplusplus diff --git a/vpx_dsp/mips/loopfilter_masks_dspr2.h b/vpx_dsp/mips/loopfilter_masks_dspr2.h index 9bf292705a62d50b2d112cec0c5592f7ec6e1c68..0a0cf577e332ead7d31826ef8f226ab339db7467 100644 --- 
a/vpx_dsp/mips/loopfilter_masks_dspr2.h +++ b/vpx_dsp/mips/loopfilter_masks_dspr2.h @@ -25,18 +25,17 @@ extern "C" { /* processing 4 pixels at the same time * compute hev and mask in the same function */ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, - uint32_t p1, uint32_t p0, - uint32_t p3, uint32_t p2, - uint32_t q0, uint32_t q1, + uint32_t p1, uint32_t p0, uint32_t p3, + uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, uint32_t q3, uint32_t thresh, uint32_t *hev, uint32_t *mask) { - uint32_t c, r, r3, r_k; - uint32_t s1, s2, s3; - uint32_t ones = 0xFFFFFFFF; - uint32_t hev1; + uint32_t c, r, r3, r_k; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t hev1; - __asm__ __volatile__ ( + __asm__ __volatile__( /* mask |= (abs(p3 - p2) > limit) */ "subu_s.qb %[c], %[p3], %[p2] \n\t" "subu_s.qb %[r_k], %[p2], %[p3] \n\t" @@ -88,14 +87,12 @@ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" "or %[r], %[r], %[c] \n\t" - : [c] "=&r" (c), [r_k] "=&r" (r_k), - [r] "=&r" (r), [r3] "=&r" (r3) - : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2), - [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0), - [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh) - ); + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh)); - __asm__ __volatile__ ( + __asm__ __volatile__( /* abs(p0 - q0) */ "subu_s.qb %[c], %[p0], %[q0] \n\t" "subu_s.qb %[r_k], %[q0], %[p0] \n\t" @@ -119,34 +116,27 @@ static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, "wrdsp %[r] \n\t" "pick.qb %[s2], $0, %[ones] \n\t" - : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1), - [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3) - : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3), - [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit) - ); + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); *hev = hev1; *mask = s2; } -static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit, - uint32_t flimit, - uint32_t thresh, - uint32_t p1, uint32_t p0, - uint32_t p3, uint32_t p2, - uint32_t q0, uint32_t q1, - uint32_t q2, uint32_t q3, - uint32_t *hev, - uint32_t *mask, - uint32_t *flat) { - uint32_t c, r, r3, r_k, r_flat; - uint32_t s1, s2, s3; - uint32_t ones = 0xFFFFFFFF; - uint32_t flat_thresh = 0x01010101; - uint32_t hev1; - uint32_t flat1; - - __asm__ __volatile__ ( +static INLINE void filter_hev_mask_flatmask4_dspr2( + uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) { + uint32_t c, r, r3, r_k, r_flat; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t hev1; + uint32_t flat1; + + __asm__ __volatile__( /* mask |= (abs(p3 - p2) > limit) */ "subu_s.qb %[c], %[p3], %[p2] \n\t" "subu_s.qb %[r_k], %[p2], %[p3] \n\t" @@ -236,15 +226,13 @@ static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit, "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" "or %[r], %[r], %[c] \n\t" - : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" 
(r3), - [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1) - : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2), - [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0), - [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh), - [flat_thresh] "r" (flat_thresh), [ones] "r" (ones) - ); + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3), + [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); - __asm__ __volatile__ ( + __asm__ __volatile__( /* abs(p0 - q0) */ "subu_s.qb %[c], %[p0], %[q0] \n\t" "subu_s.qb %[r_k], %[q0], %[p0] \n\t" @@ -268,29 +256,25 @@ static INLINE void filter_hev_mask_flatmask4_dspr2(uint32_t limit, "wrdsp %[r] \n\t" "pick.qb %[s2], $0, %[ones] \n\t" - : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1), - [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3) - : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3), - [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit) - ); + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); *hev = hev1; *mask = s2; *flat = flat1; } -static INLINE void flatmask5(uint32_t p4, uint32_t p3, - uint32_t p2, uint32_t p1, - uint32_t p0, uint32_t q0, - uint32_t q1, uint32_t q2, - uint32_t q3, uint32_t q4, - uint32_t *flat2) { - uint32_t c, r, r_k, r_flat; - uint32_t ones = 0xFFFFFFFF; - uint32_t flat_thresh = 0x01010101; - uint32_t flat1, flat3; - - __asm__ __volatile__ ( +static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t q4, uint32_t *flat2) { + uint32_t c, r, r_k, r_flat; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t flat1, flat3; + + __asm__ __volatile__( /* flat |= (abs(p4 - p0) > thresh) */ "subu_s.qb %[c], %[p4], %[p0] \n\t" "subu_s.qb %[r_k], %[p0], %[p4] \n\t" @@ -355,13 +339,11 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3, /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ "and %[flat1], %[flat3], %[flat1] \n\t" - : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), - [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3) - : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), - [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1), - [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4), - [flat_thresh] "r" (flat_thresh), [ones] "r" (ones) - ); + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat), + [flat1] "=&r"(flat1), [flat3] "=&r"(flat3) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4), + [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); *flat2 = flat1; } diff --git a/vpx_dsp/mips/loopfilter_mb_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c index dd0545eed23c71ea10c93f91c29e69aa8ff1f2db..e42479257c322e5330921286cecba20550ef8500 100644 --- a/vpx_dsp/mips/loopfilter_mb_dspr2.c +++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c @@ -19,36 +19,33 @@ #include "vpx_mem/vpx_mem.h" #if HAVE_DSPR2 -void vpx_lpf_horizontal_8_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, +void vpx_lpf_horizontal_8_dspr2(unsigned char *s, 
int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - uint32_t mask; - uint32_t hev, flat; - uint8_t i; - uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p3, p2, p1, p0, q0, q1, q2, q3; - uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; - uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + uint32_t mask; + uint32_t hev, flat; + uint8_t i; + uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p3, p2, p1, p0, q0, q1, q2, q3; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; uflimit = *blimit; - ulimit = *limit; + ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); /* prefetch data for store */ prefetch_store(s); @@ -63,7 +60,7 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, sq2 = sq1 + pitch; sq3 = sq2 + pitch; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[p3], (%[sp3]) \n\t" "lw %[p2], (%[sp2]) \n\t" "lw %[p1], (%[sp1]) \n\t" @@ -73,46 +70,39 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "lw %[q2], (%[sq2]) \n\t" "lw %[q3], (%[sq3]) \n\t" - : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), - [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0) - : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0) - ); + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0)); - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, - p1, p0, p3, p2, q0, q1, q2, q3, - &hev, &mask, &flat); + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); if ((flat == 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p1_f0], (%[sp1]) \n\t" "sw %[p0_f0], (%[sp0]) \n\t" "sw %[q0_f0], (%[sq0]) \n\t" "sw %[q1_f0], (%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); } else if ((mask & flat) == 0xFFFFFFFF) { /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + 
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); COMBINE_LEFT_RIGHT_0TO2() - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p2], (%[sp2]) \n\t" "sw %[p1], (%[sp1]) \n\t" "sw %[p0], (%[sp0]) \n\t" @@ -121,28 +111,23 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "sw %[q2], (%[sq2]) \n\t" : - : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), - [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if ((flat != 0) && (mask != 0)) { /* filtering */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], (%[sp2]) \n\t" "sb %[p1_r], (%[sp1]) \n\t" "sb %[p0_r], (%[sp0]) \n\t" @@ -151,27 +136,24 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "sb %[q2_r], (%[sq2]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], (%[sp1]) \n\t" "sb %[p0_f0], (%[sp0]) \n\t" "sb %[q0_f0], (%[sq0]) \n\t" "sb %[q1_f0], (%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t" @@ -183,15 +165,14 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), - [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ 
__volatile__( "sb %[p2_r], +1(%[sp2]) \n\t" "sb %[p1_r], +1(%[sp1]) \n\t" "sb %[p0_r], +1(%[sp0]) \n\t" @@ -200,41 +181,36 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "sb %[q2_r], +1(%[sq2]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +1(%[sp1]) \n\t" "sb %[p0_f0], +1(%[sp0]) \n\t" "sb %[q0_f0], +1(%[sq0]) \n\t" "sb %[q1_f0], +1(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), - [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], +2(%[sp2]) \n\t" "sb %[p1_l], +2(%[sp1]) \n\t" "sb %[p0_l], +2(%[sp0]) \n\t" @@ -243,27 +219,24 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "sb %[q2_l], +2(%[sq2]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +2(%[sp1]) \n\t" "sb %[p0_f0], +2(%[sp0]) \n\t" "sb %[q0_f0], +2(%[sq0]) \n\t" "sb %[q1_f0], +2(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t" @@ -275,15 +248,14 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), - [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), 
[p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], +3(%[sp2]) \n\t" "sb %[p1_l], +3(%[sp1]) \n\t" "sb %[p0_l], +3(%[sp0]) \n\t" @@ -292,24 +264,21 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, "sb %[q2_l], +3(%[sq2]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +3(%[sp1]) \n\t" "sb %[p0_f0], +3(%[sp0]) \n\t" "sb %[q0_f0], +3(%[sq0]) \n\t" "sb %[q1_f0], +3(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } } @@ -317,36 +286,33 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s, } } -void vpx_lpf_vertical_8_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, +void vpx_lpf_vertical_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev, flat; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p3, p2, p1, p0, q3, q2, q1, q0; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; - uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + uint8_t i; + uint32_t mask, hev, flat; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p3, p2, p1, p0, q3, q2, q1, q0; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; uflimit = *blimit; - ulimit = *limit; + ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); prefetch_store(s + pitch); @@ -355,9 +321,9 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, s2 = s + pitch; s3 = s2 + pitch; s4 = s3 + pitch; - s = s4 + pitch; + s = s4 + pitch; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[p0], -4(%[s1]) \n\t" "lw %[p1], -4(%[s2]) \n\t" "lw %[p2], -4(%[s3]) \n\t" @@ -367,10 +333,9 @@ void vpx_lpf_vertical_8_dspr2(unsigned char 
*s, "lw %[q1], (%[s3]) \n\t" "lw %[q0], (%[s4]) \n\t" - : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), - [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3) - : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) - ); + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); /* transpose p3, p2, p1, p0 original (when loaded from memory) @@ -387,7 +352,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, p2 p3_1 p2_1 p1_1 p0_1 p3 p3_0 p2_0 p1_0 p0_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" @@ -403,12 +368,10 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "append %[p1], %[sec3], 16 \n\t" "append %[p3], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* transpose q0, q1, q2, q3 original (when loaded from memory) @@ -425,7 +388,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, q1 q0_1 q1_1 q2_1 q3_1 q0 q0_0 q1_0 q2_0 q3_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" @@ -441,49 +404,40 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "append %[q2], %[sec3], 16 \n\t" "append %[q0], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, - p1, p0, p3, p2, q0, q1, q2, q3, - &hev, &mask, &flat); + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); if ((flat == 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); STORE_F0() } else if ((mask & flat) == 0xFFFFFFFF) { /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); STORE_F1() } else if ((flat != 0) && (mask != 0)) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, 
&q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], -3(%[s4]) \n\t" "sb %[p1_r], -2(%[s4]) \n\t" "sb %[p0_r], -1(%[s4]) \n\t" @@ -492,25 +446,22 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "sb %[q2_r], +2(%[s4]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [s4] "r" (s4) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s4]) \n\t" "sb %[p0_f0], -1(%[s4]) \n\t" "sb %[q0_f0], (%[s4]) \n\t" "sb %[q1_f0], +1(%[s4]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s4] "r" (s4) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t" @@ -522,15 +473,14 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), - [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], -3(%[s3]) \n\t" "sb %[p1_r], -2(%[s3]) \n\t" "sb %[p0_r], -1(%[s3]) \n\t" @@ -539,66 +489,58 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "sb %[q2_r], +2(%[s3]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [s3] "r" (s3) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s3]) \n\t" "sb %[p0_f0], -1(%[s3]) \n\t" "sb %[q0_f0], (%[s3]) \n\t" "sb %[q1_f0], +1(%[s3]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s3] "r" (s3) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), - [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( - "sb %[p2_l], -3(%[s2]) \n\t" - "sb %[p1_l], -2(%[s2]) \n\t" - "sb %[p0_l], 
-1(%[s2]) \n\t" - "sb %[q0_l], (%[s2]) \n\t" - "sb %[q1_l], +1(%[s2]) \n\t" - "sb %[q2_l], +2(%[s2]) \n\t" + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" - : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [s2] "r" (s2) - ); + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); } else if (mask & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s2]) \n\t" "sb %[p0_f0], -1(%[s2]) \n\t" "sb %[q0_f0], (%[s2]) \n\t" "sb %[q1_f0], +1(%[s2]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s2] "r" (s2) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t" @@ -610,15 +552,14 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), - [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], -3(%[s1]) \n\t" "sb %[p1_l], -2(%[s1]) \n\t" "sb %[p0_l], -1(%[s1]) \n\t" @@ -627,21 +568,19 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s, "sb %[q2_l], +2(%[s1]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [s1] "r" (s1) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s1]) \n\t" "sb %[p0_f0], -1(%[s1]) \n\t" "sb %[q0_f0], (%[s1]) \n\t" "sb %[q1_f0], +1(%[s1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), - [q1_f0] "r" (q1_f0), [s1] "r" (s1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); } } } diff --git a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c index 85e167ca054fd1a374a0f5ef27be64903ba6fa96..6325762a2aa26b117a9e0501680c42a5814e7a78 100644 --- a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c +++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -19,42 +19,38 @@ #include "vpx_mem/vpx_mem.h" #if HAVE_DSPR2 -static void mb_lpf_horizontal_edge(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { - uint32_t mask; - uint32_t hev, flat, flat2; - uint8_t i; - uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; - uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - uint32_t p1_f0, p0_f0, 
q0_f0, q1_f0; - uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; - uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; - uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; - uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; - uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; - uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; +static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int count) { + uint32_t mask; + uint32_t hev, flat, flat2; + uint8_t i; + uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; + uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; uflimit = *blimit; - ulimit = *limit; + ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); /* prefetch data for store */ prefetch_store(s); @@ -77,7 +73,7 @@ static void mb_lpf_horizontal_edge(unsigned char *s, sq6 = sq5 + pitch; sq7 = sq6 + pitch; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[p7], (%[sp7]) \n\t" "lw %[p6], (%[sp6]) \n\t" "lw %[p5], (%[sp5]) \n\t" @@ -87,13 +83,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "lw %[p1], (%[sp1]) \n\t" "lw %[p0], (%[sp0]) \n\t" - : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), - [p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4) - : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7) - ); + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[q0], (%[sq0]) \n\t" "lw %[q1], (%[sq1]) \n\t" "lw %[q2], (%[sq2]) \n\t" @@ -103,57 +98,50 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "lw %[q6], (%[sq6]) \n\t" "lw %[q7], (%[sq7]) \n\t" - : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0), - [q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4) - : [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0), - [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7) - ); + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [sq3] "r"(sq3), 
[sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0), + [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7)); - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, - p1, p0, p3, p2, q0, q1, q2, q3, - &hev, &mask, &flat); + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); /* f0 */ if (((flat2 == 0) && (flat == 0) && (mask != 0)) || ((flat2 != 0) && (flat == 0) && (mask != 0))) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p1_f0], (%[sp1]) \n\t" "sw %[p0_f0], (%[sp0]) \n\t" "sw %[q0_f0], (%[sq0]) \n\t" "sw %[q1_f0], (%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { /* f2 */ PACK_LEFT_0TO3() PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, - &p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l, - &q4_l, &q5_l, &q6_l, &q7_l); + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); PACK_RIGHT_0TO3() PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, - &p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r, - &q4_r, &q5_r, &q6_r, &q7_r); + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); COMBINE_LEFT_RIGHT_0TO2() COMBINE_LEFT_RIGHT_3TO6() - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p6], (%[sp6]) \n\t" "sw %[p5], (%[sp5]) \n\t" "sw %[p4], (%[sp4]) \n\t" @@ -163,13 +151,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sw %[p0], (%[sp0]) \n\t" : - : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), - [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), - [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) - ); + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6), + [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0)); - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[q6], (%[sq6]) \n\t" "sw %[q5], (%[sq5]) \n\t" "sw %[q4], (%[sq4]) \n\t" @@ -179,26 +166,23 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sw %[q0], (%[sq0]) \n\t" : - : [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3), - [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0), - [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3), - [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0) - ); + : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6), + [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2), + [sq1] "r"(sq1), [sq0] "r"(sq0)); } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { /* f1 */ /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + 
mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); COMBINE_LEFT_RIGHT_0TO2() - __asm__ __volatile__ ( + __asm__ __volatile__( "sw %[p2], (%[sp2]) \n\t" "sw %[p1], (%[sp1]) \n\t" "sw %[p0], (%[sp0]) \n\t" @@ -207,28 +191,23 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sw %[q2], (%[sq2]) \n\t" : - : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), - [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { /* f0+f1 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], (%[sp2]) \n\t" "sb %[p1_r], (%[sp1]) \n\t" "sb %[p0_r], (%[sp0]) \n\t" @@ -237,27 +216,24 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_r], (%[sq2]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], (%[sp1]) \n\t" "sb %[p0_f0], (%[sp0]) \n\t" "sb %[q0_f0], (%[sq0]) \n\t" "sb %[q1_f0], (%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t" @@ -269,15 +245,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), - [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x0000FF00) { - __asm__ 
__volatile__ ( + __asm__ __volatile__( "sb %[p2_r], +1(%[sp2]) \n\t" "sb %[p1_r], +1(%[sp1]) \n\t" "sb %[p0_r], +1(%[sp0]) \n\t" @@ -286,39 +261,35 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_r], +1(%[sq2]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +1(%[sp1]) \n\t" "sb %[p0_f0], +1(%[sp0]) \n\t" "sb %[q0_f0], +1(%[sq0]) \n\t" "sb %[q1_f0], +1(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], +2(%[sp2]) \n\t" "sb %[p1_l], +2(%[sp1]) \n\t" "sb %[p0_l], +2(%[sp0]) \n\t" @@ -327,27 +298,24 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_l], +2(%[sq2]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +2(%[sp1]) \n\t" "sb %[p0_f0], +2(%[sp0]) \n\t" "sb %[q0_f0], +2(%[sq0]) \n\t" "sb %[q1_f0], +2(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t" @@ -359,15 +327,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), - [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] 
"+r"(q1_f0) + :); if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], +3(%[sp2]) \n\t" "sb %[p1_l], +3(%[sp1]) \n\t" "sb %[p0_l], +3(%[sp0]) \n\t" @@ -376,61 +343,51 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_l], +3(%[sq2]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +3(%[sp1]) \n\t" "sb %[p0_f0], +3(%[sp0]) \n\t" "sb %[q0_f0], +3(%[sq0]) \n\t" "sb %[q1_f0], +3(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { /* f0 + f1 + f2 */ /* f0 function */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); /* f1 function */ /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, - q0_l, q1_l, q2_l, q3_l, - &p2_l_f1, &p1_l_f1, &p0_l_f1, - &q0_l_f1, &q1_l_f1, &q2_l_f1); + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, - q0_r, q1_r, q2_r, q3_r, - &p2_r_f1, &p1_r_f1, &p0_r_f1, - &q0_r_f1, &q1_r_f1, &q2_r_f1); + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); /* f2 function */ PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, - &p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l, - &q4_l, &q5_l, &q6_l, &q7_l); + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, - &p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r, - &q4_r, &q5_r, &q6_r, &q7_r); + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); if (mask & flat & flat2 & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_r], (%[sp6]) \n\t" "sb %[p5_r], (%[sp5]) \n\t" "sb %[p4_r], (%[sp4]) \n\t" @@ -440,14 +397,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[p0_r], (%[sp0]) \n\t" : - : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), - [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), - [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), - [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), - [p0_r] "r" (p0_r), [sp0] "r" (sp0) - ); - - __asm__ __volatile__ ( + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), + [sp2] "r"(sp2), [sp1] 
"r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0)); + + __asm__ __volatile__( "sb %[q0_r], (%[sq0]) \n\t" "sb %[q1_r], (%[sq1]) \n\t" "sb %[q2_r], (%[sq2]) \n\t" @@ -457,15 +412,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q6_r], (%[sq6]) \n\t" : - : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), - [q6_r] "r" (q6_r), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2), - [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5), - [sq6] "r" (sq6) - ); + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); } else if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r_f1], (%[sp2]) \n\t" "sb %[p1_r_f1], (%[sp1]) \n\t" "sb %[p0_r_f1], (%[sp0]) \n\t" @@ -474,27 +426,25 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_r_f1], (%[sq2]) \n\t" : - : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), - [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), - [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], (%[sp1]) \n\t" "sb %[p0_f0], (%[sp0]) \n\t" "sb %[q0_f0], (%[sq0]) \n\t" "sb %[q1_f0], (%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), - [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p6_r], %[p6_r], 16 \n\t" "srl %[p5_r], %[p5_r], 16 \n\t" "srl %[p4_r], %[p4_r], 16 \n\t" @@ -510,15 +460,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q5_r], %[q5_r], 16 \n\t" "srl %[q6_r], %[q6_r], 16 \n\t" - : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), - [q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r), - [p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r), - [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), - [q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r) - : - ); + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r), + [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), + [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" @@ -530,16 +479,15 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1), - [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1), - [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : 
[p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & flat2 & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_r], +1(%[sp6]) \n\t" "sb %[p5_r], +1(%[sp5]) \n\t" "sb %[p4_r], +1(%[sp4]) \n\t" @@ -549,14 +497,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[p0_r], +1(%[sp0]) \n\t" : - : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), - [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), - [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5), - [sp4] "r" (sp4), [sp3] "r" (sp3), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) - ); - - __asm__ __volatile__ ( + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( "sb %[q0_r], +1(%[sq0]) \n\t" "sb %[q1_r], +1(%[sq1]) \n\t" "sb %[q2_r], +1(%[sq2]) \n\t" @@ -566,14 +512,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q6_r], +1(%[sq6]) \n\t" : - : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), - [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1), - [sq2] "r" (sq2), [sq3] "r" (sq3), - [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6) - ); + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); } else if (mask & flat & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r_f1], +1(%[sp2]) \n\t" "sb %[p1_r_f1], +1(%[sp1]) \n\t" "sb %[p0_r_f1], +1(%[sp0]) \n\t" @@ -582,39 +526,36 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_r_f1], +1(%[sq2]) \n\t" : - : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), - [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), - [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +1(%[sp1]) \n\t" "sb %[p0_f0], +1(%[sp0]) \n\t" "sb %[q0_f0], +1(%[sq0]) \n\t" "sb %[q1_f0], +1(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), - [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat 
& flat2 & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_l], +2(%[sp6]) \n\t" "sb %[p5_l], +2(%[sp5]) \n\t" "sb %[p4_l], +2(%[sp4]) \n\t" @@ -624,14 +565,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[p0_l], +2(%[sp0]) \n\t" : - : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), - [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), - [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5), - [sp4] "r" (sp4), [sp3] "r" (sp3), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) - ); - - __asm__ __volatile__ ( + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( "sb %[q0_l], +2(%[sq0]) \n\t" "sb %[q1_l], +2(%[sq1]) \n\t" "sb %[q2_l], +2(%[sq2]) \n\t" @@ -641,14 +580,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q6_l], +2(%[sq6]) \n\t" : - : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), - [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1), - [sq2] "r" (sq2), [sq3] "r" (sq3), - [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6) - ); + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); } else if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l_f1], +2(%[sp2]) \n\t" "sb %[p1_l_f1], +2(%[sp1]) \n\t" "sb %[p0_l_f1], +2(%[sp0]) \n\t" @@ -657,27 +594,25 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_l_f1], +2(%[sq2]) \n\t" : - : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), - [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), - [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); } else if (mask & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +2(%[sp1]) \n\t" "sb %[p0_f0], +2(%[sp0]) \n\t" "sb %[q0_f0], +2(%[sq0]) \n\t" "sb %[q1_f0], +2(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), - [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p6_l], %[p6_l], 16 \n\t" "srl %[p5_l], %[p5_l], 16 \n\t" "srl %[p4_l], %[p4_l], 16 \n\t" @@ -693,15 +628,14 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q5_l], %[q5_l], 16 \n\t" "srl %[q6_l], %[q6_l], 16 \n\t" - : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l), - [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l), - [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l), - [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l) - : - ); + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] 
"+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" @@ -713,16 +647,15 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1), - [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1), - [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & flat2 & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_l], +3(%[sp6]) \n\t" "sb %[p5_l], +3(%[sp5]) \n\t" "sb %[p4_l], +3(%[sp4]) \n\t" @@ -732,14 +665,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[p0_l], +3(%[sp0]) \n\t" : - : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), - [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), - [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5), - [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2), - [sp1] "r" (sp1), [sp0] "r" (sp0) - ); - - __asm__ __volatile__ ( + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( "sb %[q0_l], +3(%[sq0]) \n\t" "sb %[q1_l], +3(%[sq1]) \n\t" "sb %[q2_l], +3(%[sq2]) \n\t" @@ -749,15 +680,12 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q6_l], +3(%[sq6]) \n\t" : - : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), - [q2_l] "r" (q2_l), [q3_l] "r" (q3_l), - [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2), - [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5), - [q6_l] "r" (q6_l), [sq6] "r" (sq6) - ); + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3), + [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6)); } else if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l_f1], +3(%[sp2]) \n\t" "sb %[p1_l_f1], +3(%[sp1]) \n\t" "sb %[p0_l_f1], +3(%[sp0]) \n\t" @@ -766,25 +694,22 @@ static void mb_lpf_horizontal_edge(unsigned char *s, "sb %[q2_l_f1], +3(%[sq2]) \n\t" : - : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), - [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), - [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), - [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) - ); + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], +3(%[sp1]) \n\t" "sb %[p0_f0], 
+3(%[sp0]) \n\t" "sb %[q0_f0], +3(%[sq0]) \n\t" "sb %[q1_f0], +3(%[sq1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [sp1] "r" (sp1), [sp0] "r" (sp0), - [sq0] "r" (sq0), [sq1] "r" (sq1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); } } diff --git a/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c index e580f014e933aa845f51eabf78cebb11a348c860..96e8d8858a117bdc4c6f3b5ddac48e3cf0759ff9 100644 --- a/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c +++ b/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c @@ -19,40 +19,36 @@ #include "vpx_mem/vpx_mem.h" #if HAVE_DSPR2 -void vpx_lpf_vertical_16_dspr2(uint8_t *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - uint8_t i; - uint32_t mask, hev, flat, flat2; - uint8_t *s1, *s2, *s3, *s4; - uint32_t prim1, prim2, sec3, sec4, prim3, prim4; - uint32_t thresh_vec, flimit_vec, limit_vec; - uint32_t uflimit, ulimit, uthresh; - uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; - uint32_t p1_f0, p0_f0, q0_f0, q1_f0; - uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; - uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; - uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; - uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; - uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; - uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; +void vpx_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat, flat2; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; uflimit = *blimit; ulimit = *limit; uthresh = *thresh; /* create quad-byte */ - __asm__ __volatile__ ( + __asm__ __volatile__( "replv.qb %[thresh_vec], %[uthresh] \n\t" "replv.qb %[flimit_vec], %[uflimit] \n\t" "replv.qb %[limit_vec], %[ulimit] \n\t" - : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), - [limit_vec] "=r" (limit_vec) - : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) - ); + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); prefetch_store(s + pitch); @@ -61,9 +57,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, s2 = s + pitch; s3 = s2 + pitch; s4 = s3 + pitch; - s = s4 + pitch; + s = s4 + pitch; - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[p0], -4(%[s1]) \n\t" "lw %[p1], -4(%[s2]) \n\t" "lw %[p2], -4(%[s3]) \n\t" @@ -73,13 +69,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "lw %[p6], -8(%[s3]) \n\t" "lw %[p7], -8(%[s4]) \n\t" - : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), - [p0] "=&r" (p0), [p7] "=&r" (p7), [p6] "=&r" (p6), - [p5] "=&r" (p5), [p4] 
"=&r" (p4) - : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) - ); + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); - __asm__ __volatile__ ( + __asm__ __volatile__( "lw %[q3], (%[s1]) \n\t" "lw %[q2], (%[s2]) \n\t" "lw %[q1], (%[s3]) \n\t" @@ -89,11 +83,9 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "lw %[q5], +4(%[s3]) \n\t" "lw %[q4], +4(%[s4]) \n\t" - : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), - [q0] "=&r" (q0), [q7] "=&r" (q7), [q6] "=&r" (q6), - [q5] "=&r" (q5), [q4] "=&r" (q4) - : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) - ); + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); /* transpose p3, p2, p1, p0 original (when loaded from memory) @@ -110,7 +102,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, p2 p3_1 p2_1 p1_1 p0_1 p3 p3_0 p2_0 p1_0 p0_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" @@ -126,12 +118,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "append %[p1], %[sec3], 16 \n\t" "append %[p3], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* transpose q0, q1, q2, q3 original (when loaded from memory) @@ -148,7 +138,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, q1 q0_1 q1_1 q2_1 q3_1 q0 q0_0 q1_0 q2_0 q3_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" @@ -164,12 +154,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "append %[q2], %[sec3], 16 \n\t" "append %[q0], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* transpose p7, p6, p5, p4 original (when loaded from memory) @@ -186,7 +174,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, p6 p7_1 p6_1 p5_1 p4_1 p7 p7_0 p6_0 p5_0 p4_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" @@ -202,12 +190,10 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "append %[p5], %[sec3], 16 \n\t" "append %[p7], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [p4] "+r" (p4), [p5] "+r" (p5), [p6] "+r" (p6), [p7] "+r" (p7), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] 
"+r"(p6), + [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); /* transpose q4, q5, q6, q7 original (when loaded from memory) @@ -224,7 +210,7 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, q5 q4_1 q5_1 q26_1 q7_1 q4 q4_0 q5_0 q26_0 q7_0 */ - __asm__ __volatile__ ( + __asm__ __volatile__( "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" @@ -240,71 +226,60 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "append %[q6], %[sec3], 16 \n\t" "append %[q4], %[sec4], 16 \n\t" - : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), - [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), - [q7] "+r" (q7), [q6] "+r" (q6), [q5] "+r" (q5), [q4] "+r" (q4), - [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) - : - ); + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5), + [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); - filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, - p1, p0, p3, p2, q0, q1, q2, q3, - &hev, &mask, &flat); + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); /* f0 */ if (((flat2 == 0) && (flat == 0) && (mask != 0)) || ((flat2 != 0) && (flat == 0) && (mask != 0))) { - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); STORE_F0() } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { /* f2 */ PACK_LEFT_0TO3() PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, - &p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l, - &q4_l, &q5_l, &q6_l, &q7_l); + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); PACK_RIGHT_0TO3() PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, - &p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r, - &q4_r, &q5_r, &q6_r, &q7_r); + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); STORE_F2() } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { /* f1 */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); STORE_F1() } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { /* f0 + f1 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); /* left 2 element operation */ PACK_LEFT_0TO3() - mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l); + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); /* right 2 element operation */ PACK_RIGHT_0TO3() - mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r); + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], -3(%[s4]) \n\t" "sb %[p1_r], -2(%[s4]) \n\t" "sb %[p0_r], -1(%[s4]) \n\t" @@ -313,25 +288,22 @@ void 
vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_r], +2(%[s4]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [s4] "r" (s4) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s4]) \n\t" "sb %[p0_f0], -1(%[s4]) \n\t" "sb %[q0_f0], (%[s4]) \n\t" "sb %[q1_f0], +1(%[s4]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s4] "r" (s4) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_r], %[p2_r], 16 \n\t" "srl %[p1_r], %[p1_r], 16 \n\t" "srl %[p0_r], %[p0_r], 16 \n\t" @@ -343,15 +315,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), - [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r], -3(%[s3]) \n\t" "sb %[p1_r], -2(%[s3]) \n\t" "sb %[p0_r], -1(%[s3]) \n\t" @@ -360,64 +331,57 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_r], +2(%[s3]) \n\t" : - : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), - [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), - [s3] "r" (s3) - ); + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s3]) \n\t" "sb %[p0_f0], -1(%[s3]) \n\t" "sb %[q0_f0], (%[s3]) \n\t" "sb %[q1_f0], +1(%[s3]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s3] "r" (s3) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( - "sb %[p2_l], -3(%[s2]) \n\t" - "sb %[p1_l], -2(%[s2]) \n\t" - "sb %[p0_l], -1(%[s2]) \n\t" - "sb %[q0_l], (%[s2]) \n\t" - "sb %[q1_l], +1(%[s2]) \n\t" - "sb %[q2_l], +2(%[s2]) \n\t" - - : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [s2] "r" (s2) - ); + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); } else if (mask & 
0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s2]) \n\t" "sb %[p0_f0], -1(%[s2]) \n\t" "sb %[q0_f0], (%[s2]) \n\t" "sb %[q1_f0], +1(%[s2]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s2] "r" (s2) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l], %[p2_l], 16 \n\t" "srl %[p1_l], %[p1_l], 16 \n\t" "srl %[p0_l], %[p0_l], 16 \n\t" @@ -429,15 +393,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), - [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l], -3(%[s1]) \n\t" "sb %[p1_l], -2(%[s1]) \n\t" "sb %[p0_l], -1(%[s1]) \n\t" @@ -446,54 +409,44 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_l], +2(%[s1]) \n\t" : - : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), - [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [s1] "r" (s1) - ); + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s1]) \n\t" "sb %[p0_f0], -1(%[s1]) \n\t" "sb %[q0_f0], (%[s1]) \n\t" "sb %[q1_f0], +1(%[s1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s1] "r" (s1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); } } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { /* f0+f1+f2 */ - filter1_dspr2(mask, hev, p1, p0, q0, q1, - &p1_f0, &p0_f0, &q0_f0, &q1_f0); + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); PACK_LEFT_0TO3() - mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, - q0_l, q1_l, q2_l, q3_l, - &p2_l_f1, &p1_l_f1, &p0_l_f1, - &q0_l_f1, &q1_l_f1, &q2_l_f1); + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); PACK_RIGHT_0TO3() - mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, - q0_r, q1_r, q2_r, q3_r, - &p2_r_f1, &p1_r_f1, &p0_r_f1, - &q0_r_f1, &q1_r_f1, &q2_r_f1); + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); PACK_LEFT_4TO7() - wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, - &p3_l, &p2_l, &p1_l, &p0_l, - &q0_l, &q1_l, &q2_l, &q3_l, - &q4_l, &q5_l, &q6_l, &q7_l); + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); PACK_RIGHT_4TO7() - wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, - &p3_r, &p2_r, &p1_r, &p0_r, - &q0_r, &q1_r, &q2_r, &q3_r, - &q4_r, &q5_r, &q6_r, &q7_r); + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); if (mask & flat & flat2 & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_r], -7(%[s4]) 
\n\t" "sb %[p5_r], -6(%[s4]) \n\t" "sb %[p4_r], -5(%[s4]) \n\t" @@ -503,13 +456,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[p0_r], -1(%[s4]) \n\t" : - : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), - [p4_r] "r" (p4_r), [p3_r] "r" (p3_r), - [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), - [p0_r] "r" (p0_r), [s4] "r" (s4) - ); + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s4] "r"(s4)); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[q0_r], (%[s4]) \n\t" "sb %[q1_r], +1(%[s4]) \n\t" "sb %[q2_r], +2(%[s4]) \n\t" @@ -519,13 +470,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q6_r], +6(%[s4]) \n\t" : - : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), - [q2_r] "r" (q2_r), [q3_r] "r" (q3_r), - [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), - [q6_r] "r" (q6_r), [s4] "r" (s4) - ); + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s4] "r"(s4)); } else if (mask & flat & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r_f1], -3(%[s4]) \n\t" "sb %[p1_r_f1], -2(%[s4]) \n\t" "sb %[p0_r_f1], -1(%[s4]) \n\t" @@ -534,26 +483,22 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_r_f1], +2(%[s4]) \n\t" : - : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), - [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), - [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), - [s4] "r" (s4) - ); + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4)); } else if (mask & 0x000000FF) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s4]) \n\t" "sb %[p0_f0], -1(%[s4]) \n\t" "sb %[q0_f0], (%[s4]) \n\t" "sb %[q1_f0], +1(%[s4]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s4] "r" (s4) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p6_r], %[p6_r], 16 \n\t" "srl %[p5_r], %[p5_r], 16 \n\t" "srl %[p4_r], %[p4_r], 16 \n\t" @@ -569,17 +514,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q5_r], %[q5_r], 16 \n\t" "srl %[q6_r], %[q6_r], 16 \n\t" - : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), - [q2_r] "+r" (q2_r), [q3_r] "+r" (q3_r), - [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r), - [q6_r] "+r" (q6_r), [p6_r] "+r" (p6_r), - [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r), - [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), - [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r) - : - ); - - __asm__ __volatile__ ( + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), + [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), + [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" @@ -591,16 +533,15 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1), - [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1), - [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] 
"+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & flat2 & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_r], -7(%[s3]) \n\t" "sb %[p5_r], -6(%[s3]) \n\t" "sb %[p4_r], -5(%[s3]) \n\t" @@ -610,12 +551,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[p0_r], -1(%[s3]) \n\t" : - : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), - [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), - [p0_r] "r" (p0_r), [s3] "r" (s3) - ); + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s3] "r"(s3)); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[q0_r], (%[s3]) \n\t" "sb %[q1_r], +1(%[s3]) \n\t" "sb %[q2_r], +2(%[s3]) \n\t" @@ -625,13 +565,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q6_r], +6(%[s3]) \n\t" : - : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), - [q2_r] "r" (q2_r), [q3_r] "r" (q3_r), - [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), - [q6_r] "r" (q6_r), [s3] "r" (s3) - ); + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s3] "r"(s3)); } else if (mask & flat & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_r_f1], -3(%[s3]) \n\t" "sb %[p1_r_f1], -2(%[s3]) \n\t" "sb %[p0_r_f1], -1(%[s3]) \n\t" @@ -640,38 +578,33 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_r_f1], +2(%[s3]) \n\t" : - : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), - [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), - [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), - [s3] "r" (s3) - ); + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3)); } else if (mask & 0x0000FF00) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s3]) \n\t" "sb %[p0_f0], -1(%[s3]) \n\t" "sb %[q0_f0], (%[s3]) \n\t" "sb %[q1_f0], +1(%[s3]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s3] "r" (s3) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p1_f0], %[p1_f0], 8 \n\t" "srl %[p0_f0], %[p0_f0], 8 \n\t" "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & flat2 & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_l], -7(%[s2]) \n\t" "sb %[p5_l], -6(%[s2]) \n\t" "sb %[p4_l], -5(%[s2]) \n\t" @@ -681,12 +614,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[p0_l], -1(%[s2]) \n\t" : - : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), - [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), - [p0_l] "r" (p0_l), [s2] "r" (s2) - ); + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s2] "r"(s2)); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[q0_l], (%[s2]) \n\t" "sb %[q1_l], +1(%[s2]) \n\t" "sb %[q2_l], +2(%[s2]) \n\t" @@ -696,12 +628,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q6_l], +6(%[s2]) \n\t" : - : [q0_l] "r" 
(q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), - [q6_l] "r" (q6_l), [s2] "r" (s2) - ); + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s2] "r"(s2)); } else if (mask & flat & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l_f1], -3(%[s2]) \n\t" "sb %[p1_l_f1], -2(%[s2]) \n\t" "sb %[p0_l_f1], -1(%[s2]) \n\t" @@ -710,26 +641,22 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_l_f1], +2(%[s2]) \n\t" : - : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), - [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), - [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), - [s2] "r" (s2) - ); + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2)); } else if (mask & 0x00FF0000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s2]) \n\t" "sb %[p0_f0], -1(%[s2]) \n\t" "sb %[q0_f0], (%[s2]) \n\t" "sb %[q1_f0], +1(%[s2]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s2] "r" (s2) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); } - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p6_l], %[p6_l], 16 \n\t" "srl %[p5_l], %[p5_l], 16 \n\t" "srl %[p4_l], %[p4_l], 16 \n\t" @@ -745,15 +672,14 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q5_l], %[q5_l], 16 \n\t" "srl %[q6_l], %[q6_l], 16 \n\t" - : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), - [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l), - [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l), - [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l), - [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l) - : - ); + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); - __asm__ __volatile__ ( + __asm__ __volatile__( "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" @@ -765,16 +691,15 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "srl %[q0_f0], %[q0_f0], 8 \n\t" "srl %[q1_f0], %[q1_f0], 8 \n\t" - : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1), - [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1), - [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1), - [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), - [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) - : - ); + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); if (mask & flat & flat2 & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p6_l], -7(%[s1]) \n\t" "sb %[p5_l], -6(%[s1]) \n\t" "sb %[p4_l], -5(%[s1]) \n\t" @@ -784,13 +709,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[p0_l], -1(%[s1]) \n\t" : - : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), - [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), - [p0_l] "r" (p0_l), - [s1] "r" (s1) - ); + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + 
[p0_l] "r"(p0_l), [s1] "r"(s1)); - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[q0_l], (%[s1]) \n\t" "sb %[q1_l], 1(%[s1]) \n\t" "sb %[q2_l], 2(%[s1]) \n\t" @@ -800,13 +723,11 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q6_l], 6(%[s1]) \n\t" : - : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), - [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), - [q6_l] "r" (q6_l), - [s1] "r" (s1) - ); + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s1] "r"(s1)); } else if (mask & flat & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p2_l_f1], -3(%[s1]) \n\t" "sb %[p1_l_f1], -2(%[s1]) \n\t" "sb %[p0_l_f1], -1(%[s1]) \n\t" @@ -815,23 +736,19 @@ void vpx_lpf_vertical_16_dspr2(uint8_t *s, "sb %[q2_l_f1], +2(%[s1]) \n\t" : - : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), - [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), - [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), - [s1] "r" (s1) - ); + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1)); } else if (mask & 0xFF000000) { - __asm__ __volatile__ ( + __asm__ __volatile__( "sb %[p1_f0], -2(%[s1]) \n\t" "sb %[p0_f0], -1(%[s1]) \n\t" "sb %[q0_f0], (%[s1]) \n\t" "sb %[q1_f0], +1(%[s1]) \n\t" : - : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), - [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), - [s1] "r" (s1) - ); + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); } } } diff --git a/vpx_dsp/mips/loopfilter_msa.h b/vpx_dsp/mips/loopfilter_msa.h index 9894701bf7b8fa18aa82beaaff8966f78772e508..d3c2bd4edb1230bc33f1874192233ef197006967 100644 --- a/vpx_dsp/mips/loopfilter_msa.h +++ b/vpx_dsp/mips/loopfilter_msa.h @@ -13,234 +13,238 @@ #include "vpx_dsp/mips/macros_msa.h" -#define VPX_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - filt = filt & (v16i8)hev_in; \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - /* combine left and right part */ \ - filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ - \ - filt = filt & (v16i8)mask_in; \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = 
__msa_xori_b((v16u8)p1_m, 0x80); \ -} +#define VPX_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt = filt & (v16i8)hev_in; \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + /* combine left and right part */ \ + filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ + \ + filt = filt & (v16i8)mask_in; \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } -#define VPX_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - filt = filt & (v16i8)hev_in; \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask_in; \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = 
__msa_xori_b((v16u8)p1_m, 0x80); \ -} +#define VPX_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + \ + filt = filt & (v16i8)hev_in; \ + \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ + filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ + filt_l += q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ + filt = filt & (v16i8)mask_in; \ + \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } -#define VPX_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) { \ - v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ - v16u8 zero_in = { 0 }; \ - \ - tmp = __msa_ori_b(zero_in, 1); \ - p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ - q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ - p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ - q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ - \ - p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ - flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ - p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ - flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ - \ - flat_out = (tmp < (v16u8)flat_out); \ - flat_out = __msa_xori_b(flat_out, 0xff); \ - flat_out = flat_out & (mask); \ -} +#define VPX_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } -#define VPX_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \ - q5_in, q6_in, q7_in, flat_in, 
flat2_out) { \ - v16u8 tmp, zero_in = { 0 }; \ - v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ - v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ - \ - tmp = __msa_ori_b(zero_in, 1); \ - p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ - q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ - p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ - q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ - p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ - q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ - p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ - q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ - \ - p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ - flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ - flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ - p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ - flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ - p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ - flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ - \ - flat2_out = (tmp < (v16u8)flat2_out); \ - flat2_out = __msa_xori_b(flat2_out, 0xff); \ - flat2_out = flat2_out & flat_in; \ -} +#define VPX_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + { \ + v16u8 tmp, zero_in = { 0 }; \ + v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ + v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ + \ + tmp = __msa_ori_b(zero_in, 1); \ + p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ + q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ + p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ + q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ + p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ + q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ + p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ + q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ + \ + p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ + flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ + flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ + p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ + flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ + p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ + flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ + \ + flat2_out = (tmp < (v16u8)flat2_out); \ + flat2_out = __msa_xori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } -#define VPX_FILTER8(p3_in, p2_in, p1_in, p0_in, \ - q0_in, q1_in, q2_in, q3_in, \ - p2_filt8_out, p1_filt8_out, p0_filt8_out, \ - q0_filt8_out, q1_filt8_out, q2_filt8_out) { \ - v8u16 tmp0, tmp1, tmp2; \ - \ - tmp2 = p2_in + p1_in + p0_in; \ - tmp0 = p3_in << 1; \ - \ - tmp0 = tmp0 + tmp2 + q0_in; \ - tmp1 = tmp0 + p3_in + p2_in; \ - p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ - \ - tmp1 = tmp0 + p1_in + q1_in; \ - p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ - \ - tmp1 = q2_in + q1_in + q0_in; \ - tmp2 = tmp2 + tmp1; \ - tmp0 = tmp2 + (p0_in); \ - tmp0 = tmp0 + (p3_in); \ - p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \ - \ - tmp0 = q2_in + q3_in; \ - tmp0 = p0_in + tmp1 + tmp0; \ - tmp1 = q3_in + q3_in; \ - tmp1 = tmp1 + tmp0; \ - q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ - \ - tmp0 = tmp2 + q3_in; \ - tmp1 = tmp0 + q0_in; \ - q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ - \ - tmp1 = tmp0 - p2_in; \ - tmp0 = q1_in + q3_in; \ - tmp1 = tmp0 + tmp1; \ - q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ -} +#define VPX_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, 
p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + { \ + v8u16 tmp0, tmp1, tmp2; \ + \ + tmp2 = p2_in + p1_in + p0_in; \ + tmp0 = p3_in << 1; \ + \ + tmp0 = tmp0 + tmp2 + q0_in; \ + tmp1 = tmp0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp1 = tmp0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp1 = q2_in + q1_in + q0_in; \ + tmp2 = tmp2 + tmp1; \ + tmp0 = tmp2 + (p0_in); \ + tmp0 = tmp0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \ + \ + tmp0 = q2_in + q3_in; \ + tmp0 = p0_in + tmp1 + tmp0; \ + tmp1 = q3_in + q3_in; \ + tmp1 = tmp1 + tmp0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp0 = tmp2 + q3_in; \ + tmp1 = tmp0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + \ + tmp1 = tmp0 - p2_in; \ + tmp0 = q1_in + q3_in; \ + tmp1 = tmp0 + tmp1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + } -#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \ - q0_in, q1_in, q2_in, q3_in, \ - limit_in, b_limit_in, thresh_in, \ - hev_out, mask_out, flat_out) { \ - v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ - v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ - \ - /* absolute subtraction of pixel values */ \ - p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ - p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ - p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ - q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ - q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ - q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ - p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ - p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ - \ - /* calculation of hev */ \ - flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ - hev_out = thresh_in < (v16u8)flat_out; \ - \ - /* calculation of mask */ \ - p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ - p1_asub_q1_m >>= 1; \ - p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ - \ - mask_out = b_limit_in < p0_asub_q0_m; \ - mask_out = __msa_max_u_b(flat_out, mask_out); \ - p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ - mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ - q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ - mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ - \ - mask_out = limit_in < (v16u8)mask_out; \ - mask_out = __msa_xori_b(mask_out, 0xff); \ -} -#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */ +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ + p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ + p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ + q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ + q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ + q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ + p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ + p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = thresh_in < (v16u8)flat_out; \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + \ + mask_out = 
b_limit_in < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + \ + mask_out = limit_in < (v16u8)mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ + } +#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */ diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index ea59eafe92abb31a45cda5f7a74702b67cff6956..f498fbe9de248be4a7126f808d0ce51fcfcff8ed 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -38,194 +38,186 @@ #define ST_SW(...) ST_W(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LH(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__ ( \ - "lh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) - -#define LW(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__ ( \ - "lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) #if (__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__ ( \ - "ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) #else // !(__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m); \ + val1_m = LW(psrc_m + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint64_t val_m = (val); \ + \ + __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } +#else // !(__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const 
uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ + __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ \ - val_m; \ -}) -#endif // (__mips == 64) + val_m; \ + }) -#define SH(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SW(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SD(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} -#else // !(__mips_isa_rev >= 6) -#define LH(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__ ( \ - "ulh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) - -#define LW(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__ ( \ - "ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) #if (__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__ ( \ - "uld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) -#else // !(__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ + __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ \ - val_m; \ -}) -#endif // (__mips == 64) - -#define SH(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SW(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SD(val, pdst) { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t 
val_m = 0; \ \ - val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ -} + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m1 = (uint8_t *)(pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_m1); \ + SW(val1_m, pdst_m1 + 4); \ + } #endif // (__mips_isa_rev >= 6) /* Description : Load 4 words with stride @@ -236,12 +228,13 @@ Load word in 'out2' from (psrc + 2 * stride) Load word in 'out3' from (psrc + 3 * stride) */ -#define LW4(psrc, stride, out0, out1, out2, out3) { \ - out0 = LW((psrc)); \ - out1 = LW((psrc) + stride); \ - out2 = LW((psrc) + 2 * stride); \ - out3 = LW((psrc) + 3 * stride); \ -} +#define LW4(psrc, stride, out0, out1, out2, out3) \ + { \ + out0 = LW((psrc)); \ + out1 = LW((psrc) + stride); \ + out2 = LW((psrc) + 2 * stride); \ + out3 = LW((psrc) + 3 * stride); \ + } /* Description : Load double words with stride Arguments : Inputs - psrc, stride @@ -249,14 +242,16 @@ Details : Load double word in 'out0' from (psrc) Load double word in 'out1' from (psrc + stride) */ -#define LD2(psrc, stride, out0, out1) { \ - out0 = LD((psrc)); \ - out1 = LD((psrc) + stride); \ -} -#define LD4(psrc, stride, out0, out1, out2, out3) { \ - LD2((psrc), stride, out0, out1); \ - LD2((psrc) + 2 * stride, stride, out2, out3); \ -} +#define LD2(psrc, stride, out0, out1) \ + { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ + } +#define LD4(psrc, stride, out0, out1, out2, out3) \ + { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ + } /* Description : Store 4 words with stride Arguments : Inputs - in0, in1, in2, in3, pdst, stride @@ -265,12 +260,13 @@ Store word from 'in2' to (pdst + 2 * stride) Store word from 'in3' to (pdst + 3 * stride) */ -#define SW4(in0, in1, in2, in3, pdst, stride) { \ - SW(in0, (pdst)) \ - SW(in1, (pdst) + stride); \ - SW(in2, (pdst) + 2 * stride); \ - SW(in3, (pdst) + 3 * stride); \ -} +#define SW4(in0, in1, in2, in3, pdst, stride) \ + { \ + SW(in0, (pdst)) \ + SW(in1, (pdst) + stride); \ + SW(in2, (pdst) + 2 * stride); \ + SW(in3, (pdst) + 3 * stride); \ + } /* Description : Store 4 double words with stride Arguments : Inputs - in0, in1, in2, in3, pdst, stride @@ -279,12 +275,13 @@ Store double word from 'in2' to (pdst + 2 * stride) Store double word from 'in3' to (pdst + 3 * stride) */ -#define SD4(in0, in1, in2, in3, pdst, stride) { \ - SD(in0, (pdst)) \ - SD(in1, (pdst) + stride); \ - SD(in2, (pdst) + 2 * stride); \ - SD(in3, (pdst) + 3 * stride); \ -} +#define SD4(in0, in1, in2, in3, pdst, stride) \ + { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + 
SD(in3, (pdst) + 3 * stride); \ + } /* Description : Load vectors with 16 byte elements with stride Arguments : Inputs - psrc, stride @@ -293,45 +290,50 @@ Details : Load 16 byte elements in 'out0' from (psrc) Load 16 byte elements in 'out1' from (psrc + stride) */ -#define LD_B2(RTYPE, psrc, stride, out0, out1) { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ -} +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) #define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) -#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ -} +#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + } #define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ -} +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) #define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) -#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ -} +#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + } #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) #define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) -#define LD_B7(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6) { \ - LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ - LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ -} +#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ + { \ + LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + } #define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) -#define LD_B8(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ -} +#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } #define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) @@ -341,33 +343,36 @@ Details : Load 8 halfword elements in 'out0' from (psrc) Load 8 halfword elements in 'out1' from (psrc + stride) */ -#define LD_H2(RTYPE, psrc, stride, out0, out1) { \ - out0 = LD_H(RTYPE, (psrc)); \ - out1 = LD_H(RTYPE, (psrc) + (stride)); \ -} +#define LD_H2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_H(RTYPE, (psrc)); \ + out1 = LD_H(RTYPE, (psrc) + (stride)); \ + } #define LD_SH2(...) 
LD_H2(v8i16, __VA_ARGS__) -#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ - LD_H2(RTYPE, (psrc), stride, out0, out1); \ - LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ -} +#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_H2(RTYPE, (psrc), stride, out0, out1); \ + LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) -#define LD_H8(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ -} +#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } #define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) -#define LD_H16(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, out12, out13, out14, out15) { \ - LD_H8(RTYPE, (psrc), stride, \ - out0, out1, out2, out3, out4, out5, out6, out7); \ - LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ - out8, out9, out10, out11, out12, out13, out14, out15); \ -} +#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + out7); \ + LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + out13, out14, out15); \ + } #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) /* Description : Load 4x4 block of signed halfword elements from 1D source @@ -375,45 +380,49 @@ Arguments : Input - psrc Outputs - out0, out1, out2, out3 */ -#define LD4x4_SH(psrc, out0, out1, out2, out3) { \ - out0 = LD_SH(psrc); \ - out2 = LD_SH(psrc + 8); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ -} +#define LD4x4_SH(psrc, out0, out1, out2, out3) \ + { \ + out0 = LD_SH(psrc); \ + out2 = LD_SH(psrc + 8); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + } /* Description : Load 2 vectors of signed word elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - signed word */ -#define LD_SW2(psrc, stride, out0, out1) { \ - out0 = LD_SW((psrc)); \ - out1 = LD_SW((psrc) + stride); \ -} +#define LD_SW2(psrc, stride, out0, out1) \ + { \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ + } /* Description : Store vectors of 16 byte elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 16 byte elements from 'in0' to (pdst) Store 16 byte elements from 'in1' to (pdst + stride) */ -#define ST_B2(RTYPE, in0, in1, pdst, stride) { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ -} +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ -} +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } #define ST_UB4(...) 
ST_B4(v16u8, __VA_ARGS__) -#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - pdst, stride) { \ - ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ - ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ -} +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) /* Description : Store vectors of 8 halfword elements with stride @@ -421,22 +430,25 @@ Details : Store 8 halfword elements from 'in0' to (pdst) Store 8 halfword elements from 'in1' to (pdst + stride) */ -#define ST_H2(RTYPE, in0, in1, pdst, stride) { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ -} +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } #define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) -#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ - ST_H2(RTYPE, in0, in1, (pdst), stride); \ - ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ -} +#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_H2(RTYPE, in0, in1, (pdst), stride); \ + ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } #define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) -#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \ - ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ - ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ -} +#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ + ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } #define ST_SH8(...) 
ST_H8(v8i16, __VA_ARGS__) /* Description : Store vectors of word elements with stride @@ -444,10 +456,11 @@ Details : Store 4 word elements from 'in0' to (pdst) Store 4 word elements from 'in1' to (pdst + stride) */ -#define ST_SW2(in0, in1, pdst, stride) { \ - ST_SW(in0, (pdst)); \ - ST_SW(in1, (pdst) + stride); \ -} +#define ST_SW2(in0, in1, pdst, stride) \ + { \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ + } /* Description : Store 2x4 byte block to destination memory from input vector Arguments : Inputs - in, stidx, pdst, stride @@ -460,20 +473,21 @@ Index 'stidx+3' halfword element from 'in' vector is copied to the GP register and stored to (pdst + 3 * stride) */ -#define ST2x4_UB(in, stidx, pdst, stride) { \ - uint16_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ - out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ - out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ - out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ - \ - SH(out0_m, pblk_2x4_m); \ - SH(out1_m, pblk_2x4_m + stride); \ - SH(out2_m, pblk_2x4_m + 2 * stride); \ - SH(out3_m, pblk_2x4_m + 3 * stride); \ -} +#define ST2x4_UB(in, stidx, pdst, stride) \ + { \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ + } /* Description : Store 4x2 byte block to destination memory from input vector Arguments : Inputs - in, pdst, stride @@ -482,16 +496,17 @@ Index 1 word element from 'in' vector is copied to the GP register and stored to (pdst + stride) */ -#define ST4x2_UB(in, pdst, stride) { \ - uint32_t out0_m, out1_m; \ - uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_w((v4i32)in, 0); \ - out1_m = __msa_copy_u_w((v4i32)in, 1); \ - \ - SW(out0_m, pblk_4x2_m); \ - SW(out1_m, pblk_4x2_m + stride); \ -} +#define ST4x2_UB(in, pdst, stride) \ + { \ + uint32_t out0_m, out1_m; \ + uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in, 0); \ + out1_m = __msa_copy_u_w((v4i32)in, 1); \ + \ + SW(out0_m, pblk_4x2_m); \ + SW(out1_m, pblk_4x2_m + stride); \ + } /* Description : Store 4x4 byte block to destination memory from input vector Arguments : Inputs - in0, in1, pdst, stride @@ -504,35 +519,38 @@ 'Idx3' word element from input vector 'in0' is copied to the GP register and stored to (pdst + 3 * stride) */ -#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ - uint32_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ - out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ - out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ - out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ - \ - SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ -} -#define ST4x8_UB(in0, in1, pdst, stride) { \ - uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ - \ - ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ - ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ -} +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ + { \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ + \ 
+ out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ + } +#define ST4x8_UB(in0, in1, pdst, stride) \ + { \ + uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ + \ + ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ + ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ + } /* Description : Store 8x1 byte block to destination memory from input vector Arguments : Inputs - in, pdst Details : Index 0 double word element from 'in' vector is copied to the GP register and stored to (pdst) */ -#define ST8x1_UB(in, pdst) { \ - uint64_t out0_m; \ - \ - out0_m = __msa_copy_u_d((v2i64)in, 0); \ - SD(out0_m, pdst); \ -} +#define ST8x1_UB(in, pdst) \ + { \ + uint64_t out0_m; \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + SD(out0_m, pdst); \ + } /* Description : Store 8x2 byte block to destination memory from input vector Arguments : Inputs - in, pdst, stride @@ -541,16 +559,17 @@ Index 1 double word element from 'in' vector is copied to the GP register and stored to (pdst + stride) */ -#define ST8x2_UB(in, pdst, stride) { \ - uint64_t out0_m, out1_m; \ - uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_d((v2i64)in, 0); \ - out1_m = __msa_copy_u_d((v2i64)in, 1); \ - \ - SD(out0_m, pblk_8x2_m); \ - SD(out1_m, pblk_8x2_m + stride); \ -} +#define ST8x2_UB(in, pdst, stride) \ + { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ + } /* Description : Store 8x4 byte block to destination memory from input vectors @@ -564,17 +583,18 @@ Index 1 double word element from 'in1' vector is copied to the GP register and stored to (pdst + 3 * stride) */ -#define ST8x4_UB(in0, in1, pdst, stride) { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_d((v2i64)in0, 0); \ - out1_m = __msa_copy_u_d((v2i64)in0, 1); \ - out2_m = __msa_copy_u_d((v2i64)in1, 0); \ - out3_m = __msa_copy_u_d((v2i64)in1, 1); \ - \ - SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ -} +#define ST8x4_UB(in0, in1, pdst, stride) \ + { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ + } /* Description : average with rounding (in0 + in1 + 1) / 2. Arguments : Inputs - in0, in1, in2, in3, @@ -584,17 +604,19 @@ each unsigned byte element from 'in1' vector. Then the average with rounding is calculated and written to 'out0' */ -#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ - out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ -} +#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ + out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ + } #define AVER_UB2_UB(...) 
AVER_UB2(v16u8, __VA_ARGS__) -#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ - AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ -} +#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ + } #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) /* Description : Immediate number of elements to slide with zero @@ -604,18 +626,20 @@ Details : Byte elements from 'zero_m' vector are slid into 'in0' by value specified in the 'slide_val' */ -#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \ - v16i8 zero_m = { 0 }; \ - out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ - out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ -} +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ + { \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ + } #define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) -#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \ - out0, out1, out2, out3, slide_val) { \ - SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ - SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ -} +#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ + slide_val) \ + { \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ + } #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) /* Description : Immediate number of elements to slide @@ -625,18 +649,20 @@ Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by value specified in the 'slide_val' */ -#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \ - out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ - out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ -} +#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + { \ + out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ + } #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) -#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \ - out0, out1, out2, slide_val) { \ - SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ - out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ -} +#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ + out2, slide_val) \ + { \ + SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ + } #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) #define SLDI_B3_UH(...) 
SLDI_B3(v8u16, __VA_ARGS__) @@ -647,19 +673,21 @@ Details : Byte elements from 'in0' & 'in1' are copied selectively to 'out0' as per control vector 'mask0' */ -#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ - out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ -} +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) -#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \ - out0, out1, out2, out3) { \ - VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ - VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ -} +#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ + out3) \ + { \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ + } #define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) @@ -673,18 +701,19 @@ The multiplication result of adjacent odd-even elements are added together and written to the 'out0' vector */ -#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ - out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ -} +#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ + out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ + } #define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) -#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, \ - out0, out1, out2, out3) { \ - DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} +#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) /* Description : Dot product of byte vector elements @@ -697,17 +726,19 @@ The multiplication result of adjacent odd-even elements are added together and written to the 'out0' vector */ -#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ -} +#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ + } #define DOTP_SB2_SH(...) 
DOTP_SB2(v8i16, __VA_ARGS__) -#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ - DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} +#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) /* Description : Dot product of halfword vector elements @@ -720,18 +751,19 @@ The multiplication result of adjacent odd-even elements are added together and written to the 'out0' vector */ -#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ -} +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ + } #define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) -#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, \ - out0, out1, out2, out3) { \ - DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) /* Description : Dot product of word vector elements @@ -744,10 +776,11 @@ The multiplication result of adjacent odd-even elements are added together and written to the 'out0' vector */ -#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ -} +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ + } #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) /* Description : Dot product & addition of byte vector elements @@ -760,17 +793,19 @@ The multiplication result of adjacent odd-even elements are added to the 'out0' vector */ -#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ - out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ -} +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ + } #define DPADD_SB2_SH(...) 
DPADD_SB2(v8i16, __VA_ARGS__) -#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ - DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} +#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) /* Description : Dot product & addition of halfword vector elements @@ -783,10 +818,11 @@ The multiplication result of adjacent odd-even elements are added to the 'out0' vector */ -#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ - out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ -} +#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ + } #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) /* Description : Dot product & addition of double word vector elements @@ -799,10 +835,11 @@ The multiplication result of adjacent odd-even elements are added to the 'out0' vector */ -#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) { \ - out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ - out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ -} +#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ + out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ + } #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) /* Description : Minimum values between unsigned elements of @@ -813,16 +850,18 @@ Details : Minimum of unsigned halfword element values from 'in0' and 'min_vec' are written to output vector 'in0' */ -#define MIN_UH2(RTYPE, in0, in1, min_vec) { \ - in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ - in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ -} +#define MIN_UH2(RTYPE, in0, in1, min_vec) \ + { \ + in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ + in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ + } #define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) -#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \ - MIN_UH2(RTYPE, in0, in1, min_vec); \ - MIN_UH2(RTYPE, in2, in3, min_vec); \ -} +#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ + { \ + MIN_UH2(RTYPE, in0, in1, min_vec); \ + MIN_UH2(RTYPE, in2, in3, min_vec); \ + } #define MIN_UH4_UH(...) 
MIN_UH4(v8u16, __VA_ARGS__) /* Description : Clips all signed halfword elements of input vector @@ -831,22 +870,25 @@ Output - out_m Return Type - signed halfword */ -#define CLIP_SH_0_255(in) ({ \ - v8i16 max_m = __msa_ldi_h(255); \ - v8i16 out_m; \ - \ - out_m = __msa_maxi_s_h((v8i16)in, 0); \ - out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ - out_m; \ -}) -#define CLIP_SH2_0_255(in0, in1) { \ - in0 = CLIP_SH_0_255(in0); \ - in1 = CLIP_SH_0_255(in1); \ -} -#define CLIP_SH4_0_255(in0, in1, in2, in3) { \ - CLIP_SH2_0_255(in0, in1); \ - CLIP_SH2_0_255(in2, in3); \ -} +#define CLIP_SH_0_255(in) \ + ({ \ + v8i16 max_m = __msa_ldi_h(255); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h((v8i16)in, 0); \ + out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ + out_m; \ + }) +#define CLIP_SH2_0_255(in0, in1) \ + { \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ + } +#define CLIP_SH4_0_255(in0, in1, in2, in3) \ + { \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ + } /* Description : Horizontal addition of 4 signed word elements of input vector Arguments : Input - in (signed word vector) @@ -855,16 +897,17 @@ Details : 4 signed word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_SW_S32(in) ({ \ - v2i64 res0_m, res1_m; \ - int32_t sum_m; \ - \ - res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ - res1_m = __msa_splati_d(res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ - sum_m; \ -}) +#define HADD_SW_S32(in) \ + ({ \ + v2i64 res0_m, res1_m; \ + int32_t sum_m; \ + \ + res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + res1_m = __msa_splati_d(res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ + sum_m; \ + }) /* Description : Horizontal addition of 8 unsigned halfword elements Arguments : Inputs - in (unsigned halfword vector) @@ -873,18 +916,19 @@ Details : 8 unsigned halfword elements of input vector are added together and the resulting integer sum is returned */ -#define HADD_UH_U32(in) ({ \ - v4u32 res_m; \ - v2u64 res0_m, res1_m; \ - uint32_t sum_m; \ - \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - res0_m = __msa_hadd_u_d(res_m, res_m); \ - res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ - sum_m; \ -}) +#define HADD_UH_U32(in) \ + ({ \ + v4u32 res_m; \ + v2u64 res0_m, res1_m; \ + uint32_t sum_m; \ + \ + res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + res0_m = __msa_hadd_u_d(res_m, res_m); \ + res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ + sum_m; \ + }) /* Description : Horizontal addition of unsigned byte vector elements Arguments : Inputs - in0, in1 @@ -894,16 +938,18 @@ even unsigned byte element from 'in0' (pairwise) and the halfword result is written to 'out0' */ -#define HADD_UB2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ - out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ -} +#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ + } #define HADD_UB2_UH(...) 
HADD_UB2(v8u16, __VA_ARGS__) -#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \ - HADD_UB2(RTYPE, in0, in1, out0, out1); \ - HADD_UB2(RTYPE, in2, in3, out2, out3); \ -} +#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + HADD_UB2(RTYPE, in0, in1, out0, out1); \ + HADD_UB2(RTYPE, in2, in3, out2, out3); \ + } #define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) /* Description : Horizontal subtraction of unsigned byte vector elements @@ -914,10 +960,11 @@ even unsigned byte element from 'in0' (pairwise) and the halfword result is written to 'out0' */ -#define HSUB_UB2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ - out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ -} +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ + } #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) /* Description : SAD (Sum of Absolute Difference) @@ -928,18 +975,19 @@ 'ref0' is calculated and preserved in 'diff0'. Then even-odd pairs are added together to generate 8 halfword results. */ -#define SAD_UB2_UH(in0, in1, ref0, ref1) ({ \ - v16u8 diff0_m, diff1_m; \ - v8u16 sad_m = { 0 }; \ - \ - diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ - diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ - \ - sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ - sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ - \ - sad_m; \ -}) +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ + ({ \ + v16u8 diff0_m, diff1_m; \ + v8u16 sad_m = { 0 }; \ + \ + diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ + diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ + \ + sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ + sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ + \ + sad_m; \ + }) /* Description : Horizontal subtraction of signed halfword vector elements Arguments : Inputs - in0, in1 @@ -949,10 +997,11 @@ even signed halfword element from 'in0' (pairwise) and the word result is written to 'out0' */ -#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ - out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ -} +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ + } #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) /* Description : Set element n input vector to GPR value @@ -961,25 +1010,28 @@ Return Type - as per RTYPE Details : Set element 0 in vector 'out' to value specified in 'in0' */ -#define INSERT_W2(RTYPE, in0, in1, out) { \ - out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ -} +#define INSERT_W2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + } #define INSERT_W2_SB(...) 
INSERT_W2(v16i8, __VA_ARGS__) -#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \ - out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ -} +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ + } #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) -#define INSERT_D2(RTYPE, in0, in1, out) { \ - out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ - out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ -} +#define INSERT_D2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ + } #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) @@ -990,10 +1042,11 @@ Details : Even byte elements of 'in0' and 'in1' are interleaved and written to 'out0' */ -#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ -} +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + } #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) @@ -1004,10 +1057,11 @@ Details : Even halfword elements of 'in0' and 'in1' are interleaved and written to 'out0' */ -#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ - out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ -} +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ + } #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) @@ -1019,10 +1073,11 @@ Details : Even word elements of 'in0' and 'in1' are interleaved and written to 'out0' */ -#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ - out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ -} +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ + } #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) /* Description : Interleave even double word elements from vectors @@ -1032,10 +1087,11 @@ Details : Even double word elements of 'in0' and 'in1' are interleaved and written to 'out0' */ -#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ - out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ -} +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ + out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ + } #define ILVEV_D2_UB(...) 
ILVEV_D2(v16u8, __VA_ARGS__) /* Description : Interleave left half of byte elements from vectors @@ -1045,20 +1101,22 @@ Details : Left half of byte elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ -#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ -} +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ + } #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) -#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) @@ -1070,10 +1128,11 @@ Details : Left half of halfword elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ -#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ -} +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ + } #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) @@ -1084,10 +1143,11 @@ Details : Left half of word elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ -#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ -} +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ + } #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) @@ -1098,33 +1158,36 @@ Details : Right half of byte elements of 'in0' and 'in1' are interleaved and written to out0. */ -#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ -} +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ + } #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) #define ILVR_B2_SH(...) 
ILVR_B2(v8i16, __VA_ARGS__) -#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) -#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, in9, in10, in11, in12, in13, in14, in15, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3); \ - ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \ - out4, out5, out6, out7); \ -} +#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ + out5, out6, out7) \ + { \ + ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3); \ + ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ + out6, out7); \ + } #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) /* Description : Interleave right half of halfword elements from vectors @@ -1134,32 +1197,36 @@ Details : Right half of halfword elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ -#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ -} +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ + } #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) -#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) -#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ -} +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ + } #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) -#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ILVR_W4_UB(...) 
ILVR_W4(v16u8, __VA_ARGS__) /* Description : Interleave right half of double word elements from vectors @@ -1169,25 +1236,28 @@ Details : Right half of double word elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ -#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ - out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ -} +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ + } #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) -#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \ - ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ -} +#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ + } #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) -#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) @@ -1198,26 +1268,29 @@ Details : Right half of byte elements from 'in0' and 'in1' are interleaved and written to 'out0' */ -#define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ -} +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) -#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ -} +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + } #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) -#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ -} +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + } #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) @@ -1232,16 +1305,18 @@ value generated with (sat_val + 1) bit range. 
The results are written in place */ -#define SAT_UH2(RTYPE, in0, in1, sat_val) { \ - in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ - in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ -} +#define SAT_UH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ + } #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) -#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \ - SAT_UH2(RTYPE, in0, in1, sat_val); \ - SAT_UH2(RTYPE, in2, in3, sat_val) \ -} +#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_UH2(RTYPE, in0, in1, sat_val); \ + SAT_UH2(RTYPE, in2, in3, sat_val) \ + } #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) /* Description : Saturate the halfword element values to the max @@ -1254,16 +1329,18 @@ value generated with (sat_val + 1) bit range The results are written in place */ -#define SAT_SH2(RTYPE, in0, in1, sat_val) { \ - in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ - in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ -} +#define SAT_SH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ + } #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) -#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \ - SAT_SH2(RTYPE, in0, in1, sat_val); \ - SAT_SH2(RTYPE, in2, in3, sat_val); \ -} +#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_SH2(RTYPE, in0, in1, sat_val); \ + SAT_SH2(RTYPE, in2, in3, sat_val); \ + } #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) /* Description : Indexed halfword element values are replicated to all @@ -1275,17 +1352,18 @@ elements in 'out0' vector Valid index range for halfword operation is 0-7 */ -#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \ - out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ - out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ -} +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ + out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ + } #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) -#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \ - out0, out1, out2, out3) { \ - SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ - SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ -} +#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ + { \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ + } #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) @@ -1297,19 +1375,21 @@ 'out0' & even byte elements of 'in1' are copied to the right half of 'out0'. */ -#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ -} +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ + } #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) #define PCKEV_B2_SH(...) 
PCKEV_B2(v8i16, __VA_ARGS__) -#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) @@ -1322,18 +1402,20 @@ 'out0' & even halfword elements of 'in1' are copied to the right half of 'out0'. */ -#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ -} +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ + } #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) -#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) /* Description : Pack even double word elements of vector pairs @@ -1344,18 +1426,20 @@ 'out0' & even double elements of 'in1' are copied to the right half of 'out0'. */ -#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ - out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ -} +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ + } #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) -#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) /* Description : Each byte element is logically xor'ed with immediate 128 @@ -1365,30 +1449,34 @@ Details : Each unsigned byte element from input vector 'in0' is logically xor'ed with 128 and the result is stored in-place. */ -#define XORI_B2_128(RTYPE, in0, in1) { \ - in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ - in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ -} +#define XORI_B2_128(RTYPE, in0, in1) \ + { \ + in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ + in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ + } #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) #define XORI_B2_128_SB(...) 
XORI_B2_128(v16i8, __VA_ARGS__) -#define XORI_B3_128(RTYPE, in0, in1, in2) { \ - XORI_B2_128(RTYPE, in0, in1); \ - in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ -} +#define XORI_B3_128(RTYPE, in0, in1, in2) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ + } #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) -#define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \ - XORI_B2_128(RTYPE, in0, in1); \ - XORI_B2_128(RTYPE, in2, in3); \ -} +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ + } #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) -#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \ - XORI_B4_128(RTYPE, in0, in1, in2, in3); \ - XORI_B3_128(RTYPE, in4, in5, in6); \ -} +#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ + { \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B3_128(RTYPE, in4, in5, in6); \ + } #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) /* Description : Average of signed halfword elements -> (a + b) / 2 @@ -1400,13 +1488,14 @@ in one extra bit in the result. The result is then divided by 2 and written to 'out0' */ -#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ - out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ - out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ -} +#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ + out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ + out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ + } #define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) /* Description : Addition of signed halfword elements and signed saturation @@ -1417,17 +1506,19 @@ halfword elements of 'in1'. The result is then signed saturated between halfword data type range */ -#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ -} +#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ + } #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) -#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} +#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) /* Description : Shift left all elements of vector (generic for all data types) @@ -1437,12 +1528,13 @@ Details : Each element of vector 'in0' is left shifted by 'shift' and the result is written in-place. 
*/ -#define SLLI_4V(in0, in1, in2, in3, shift) { \ - in0 = in0 << shift; \ - in1 = in1 << shift; \ - in2 = in2 << shift; \ - in3 = in3 << shift; \ -} +#define SLLI_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ + } /* Description : Arithmetic shift right all elements of vector (generic for all data types) @@ -1452,12 +1544,13 @@ Details : Each element of vector 'in0' is right shifted by 'shift' and the result is written in-place. 'shift' is a GP variable. */ -#define SRA_4V(in0, in1, in2, in3, shift) { \ - in0 = in0 >> shift; \ - in1 = in1 >> shift; \ - in2 = in2 >> shift; \ - in3 = in3 >> shift; \ -} +#define SRA_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ + } /* Description : Shift right arithmetic rounded words Arguments : Inputs - in0, in1, shift @@ -1469,15 +1562,17 @@ rounding and the result is written in-place. 'shift' is a vector. */ -#define SRAR_W2(RTYPE, in0, in1, shift) { \ - in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ - in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ -} - -#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) { \ - SRAR_W2(RTYPE, in0, in1, shift) \ - SRAR_W2(RTYPE, in2, in3, shift) \ -} +#define SRAR_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ + } + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRAR_W2(RTYPE, in0, in1, shift) \ + SRAR_W2(RTYPE, in2, in3, shift) \ + } #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) /* Description : Shift right arithmetic rounded (immediate) @@ -1489,30 +1584,34 @@ shifted value for rounding and the result is written in-place. 'shift' is an immediate value. */ -#define SRARI_H2(RTYPE, in0, in1, shift) { \ - in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ - in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ -} +#define SRARI_H2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ + in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ + } #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) -#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \ - SRARI_H2(RTYPE, in0, in1, shift); \ - SRARI_H2(RTYPE, in2, in3, shift); \ -} +#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_H2(RTYPE, in0, in1, shift); \ + SRARI_H2(RTYPE, in2, in3, shift); \ + } #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) -#define SRARI_W2(RTYPE, in0, in1, shift) { \ - in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ - in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ -} +#define SRARI_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ + } #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) -#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \ - SRARI_W2(RTYPE, in0, in1, shift); \ - SRARI_W2(RTYPE, in2, in3, shift); \ -} +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ + } #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) /* Description : Logical shift right all elements of vector (immediate) @@ -1522,12 +1621,13 @@ Details : Each element of vector 'in0' is right shifted by 'shift' and the result is written in-place. 
'shift' is an immediate value. */ -#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) { \ - out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ - out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ - out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ - out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ -} +#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ + { \ + out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ + out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ + out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ + out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ + } #define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__) /* Description : Multiplication of pairs of vectors @@ -1536,15 +1636,16 @@ Details : Each element from 'in0' is multiplied with elements from 'in1' and the result is written to 'out0' */ -#define MUL2(in0, in1, in2, in3, out0, out1) { \ - out0 = in0 * in1; \ - out1 = in2 * in3; \ -} -#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - MUL2(in0, in1, in2, in3, out0, out1); \ - MUL2(in4, in5, in6, in7, out2, out3); \ -} +#define MUL2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + } +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ + } /* Description : Addition of 2 pairs of vectors Arguments : Inputs - in0, in1, in2, in3 @@ -1552,15 +1653,16 @@ Details : Each element in 'in0' is added to 'in1' and result is written to 'out0'. */ -#define ADD2(in0, in1, in2, in3, out0, out1) { \ - out0 = in0 + in1; \ - out1 = in2 + in3; \ -} -#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ADD2(in0, in1, in2, in3, out0, out1); \ - ADD2(in4, in5, in6, in7, out2, out3); \ -} +#define ADD2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + } +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ + } /* Description : Subtraction of 2 pairs of vectors Arguments : Inputs - in0, in1, in2, in3 @@ -1568,17 +1670,18 @@ Details : Each element in 'in1' is subtracted from 'in0' and result is written to 'out0'. 
*/ -#define SUB2(in0, in1, in2, in3, out0, out1) { \ - out0 = in0 - in1; \ - out1 = in2 - in3; \ -} -#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - out0 = in0 - in1; \ - out1 = in2 - in3; \ - out2 = in4 - in5; \ - out3 = in6 - in7; \ -} +#define SUB2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + } +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ + } /* Description : Sign extend halfword elements from right half of the vector Arguments : Input - in (halfword vector) @@ -1588,12 +1691,13 @@ extracted and interleaved with same vector 'in0' to generate 4 word elements keeping sign intact */ -#define UNPCK_R_SH_SW(in, out) { \ - v8i16 sign_m; \ - \ - sign_m = __msa_clti_s_h((v8i16)in, 0); \ - out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ -} +#define UNPCK_R_SH_SW(in, out) \ + { \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ + } /* Description : Zero extend unsigned byte elements to halfword elements Arguments : Input - in (unsigned byte vector) @@ -1602,11 +1706,12 @@ Details : Zero extended right half of vector is returned in 'out0' Zero extended left half of vector is returned in 'out1' */ -#define UNPCK_UB_SH(in, out0, out1) { \ - v16i8 zero_m = { 0 }; \ - \ - ILVRL_B2_SH(zero_m, in, out0, out1); \ -} +#define UNPCK_UB_SH(in, out0, out1) \ + { \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ + } /* Description : Sign extend halfword elements from input vector and return the result in pair of vectors @@ -1619,91 +1724,96 @@ Then interleaved left with same vector 'in0' to generate 4 signed word elements in 'out1' */ -#define UNPCK_SH_SW(in, out0, out1) { \ - v8i16 tmp_m; \ - \ - tmp_m = __msa_clti_s_h((v8i16)in, 0); \ - ILVRL_H2_SW(tmp_m, in, out0, out1); \ -} +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ + } /* Description : Butterfly of 4 input vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Details : Butterfly operation */ -#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ - out0 = in0 + in3; \ - out1 = in1 + in2; \ - \ - out2 = in1 - in2; \ - out3 = in0 - in3; \ -} +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ + } /* Description : Butterfly of 8 input vectors Arguments : Inputs - in0 ... in7 Outputs - out0 .. out7 Details : Butterfly operation */ -#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - out0 = in0 + in7; \ - out1 = in1 + in6; \ - out2 = in2 + in5; \ - out3 = in3 + in4; \ - \ - out4 = in3 - in4; \ - out5 = in2 - in5; \ - out6 = in1 - in6; \ - out7 = in0 - in7; \ -} +#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + out0 = in0 + in7; \ + out1 = in1 + in6; \ + out2 = in2 + in5; \ + out3 = in3 + in4; \ + \ + out4 = in3 - in4; \ + out5 = in2 - in5; \ + out6 = in1 - in6; \ + out7 = in0 - in7; \ + } /* Description : Butterfly of 16 input vectors Arguments : Inputs - in0 ... in15 Outputs - out0 .. 
out15 Details : Butterfly operation */ -#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, in9, in10, in11, in12, in13, in14, in15, \ - out0, out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, out12, out13, out14, out15) { \ - out0 = in0 + in15; \ - out1 = in1 + in14; \ - out2 = in2 + in13; \ - out3 = in3 + in12; \ - out4 = in4 + in11; \ - out5 = in5 + in10; \ - out6 = in6 + in9; \ - out7 = in7 + in8; \ +#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, \ + out4, out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + out0 = in0 + in15; \ + out1 = in1 + in14; \ + out2 = in2 + in13; \ + out3 = in3 + in12; \ + out4 = in4 + in11; \ + out5 = in5 + in10; \ + out6 = in6 + in9; \ + out7 = in7 + in8; \ \ - out8 = in7 - in8; \ - out9 = in6 - in9; \ - out10 = in5 - in10; \ - out11 = in4 - in11; \ - out12 = in3 - in12; \ - out13 = in2 - in13; \ - out14 = in1 - in14; \ - out15 = in0 - in15; \ -} + out8 = in7 - in8; \ + out9 = in6 - in9; \ + out10 = in5 - in10; \ + out11 = in4 - in11; \ + out12 = in3 - in12; \ + out13 = in2 - in13; \ + out14 = in1 - in14; \ + out15 = in0 - in15; \ + } /* Description : Transpose input 8x8 byte block Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - as per RTYPE */ -#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \ - tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ - ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ - ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ - ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ - SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ - SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ -} +#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ + tmp3_m); \ + ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ + ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ + ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ + ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ + SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ + SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ + } #define TRANSPOSE8x8_UB_UB(...) 
TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors @@ -1712,128 +1822,133 @@ Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - unsigned byte */ -#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, in9, in10, in11, in12, in13, in14, in15, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ - ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ - ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ - ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ - \ - tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ - tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ - tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ - tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ - out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ - tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ - out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ - tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ - \ - ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ - out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ - out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ - out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ - out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ -} +#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ + in10, in11, in12, in13, in14, in15, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ + ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ + ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ + ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ + \ + tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ + tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ + tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ + tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ + out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ + tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ + out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ + tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ + \ + ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ + out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ + out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); 
\ + out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ + out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + } /* Description : Transpose 4x4 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Return Type - signed halfword */ -#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 s0_m, s1_m; \ - \ - ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ - ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ -} +#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 s0_m, s1_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ + ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ + } /* Description : Transpose 4x8 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - signed halfword */ -#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ - v8i16 zero_m = { 0 }; \ - \ - ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ - tmp0_n, tmp1_n, tmp2_n, tmp3_n); \ - ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ - ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ - \ - out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ - out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ - \ - out4 = zero_m; \ - out5 = zero_m; \ - out6 = zero_m; \ - out7 = zero_m; \ -} +#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ + v8i16 zero_m = { 0 }; \ + \ + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \ + tmp3_n); \ + ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ + ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ + \ + out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + \ + out4 = zero_m; \ + out5 = zero_m; \ + out6 = zero_m; \ + out7 = zero_m; \ + } /* Description : Transpose 8x4 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - signed halfword */ -#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ 
- ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ - ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ - ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ - ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ -} +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ + ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ + } /* Description : Transpose 8x8 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - as per RTYPE */ -#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 s0_m, s1_m; \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ - ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ - ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ - ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ - PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ - tmp3_m, tmp7_m, out0, out2, out4, out6); \ - out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ - out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ - out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ - out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ -} +#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m; \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ + ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ + ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ + PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ + tmp7_m, out0, out2, out4, out6); \ + out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ + out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ + out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ + out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ + } #define TRANSPOSE8x8_SH_SH(...) 
TRANSPOSE8x8_H(v8i16, __VA_ARGS__) /* Description : Transpose 4x4 block with word elements in vectors @@ -1841,40 +1956,42 @@ Outputs - out0, out1, out2, out3 Return Type - signed word */ -#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ - ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ - \ - out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ - out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ - out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ - out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ -} +#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + \ + out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ + } /* Description : Add block 4x4 Arguments : Inputs - in0, in1, in2, in3, pdst, stride Details : Least significant 4 bytes from each input vector are added to the destination bytes, clipped between 0-255 and stored. */ -#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \ - uint32_t src0_m, src1_m, src2_m, src3_m; \ - v8i16 inp0_m, inp1_m, res0_m, res1_m; \ - v16i8 dst0_m = { 0 }; \ - v16i8 dst1_m = { 0 }; \ - v16i8 zero_m = { 0 }; \ - \ - ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ - LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ - INSERT_W2_SB(src0_m, src1_m, dst0_m); \ - INSERT_W2_SB(src2_m, src3_m, dst1_m); \ - ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ - ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ - CLIP_SH2_0_255(res0_m, res1_m); \ - PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ - ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ -} +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ + { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ + \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ + } /* Description : Pack even elements of input vectors & xor with 128 Arguments : Inputs - in0, in1 @@ -1884,53 +2001,57 @@ together in one vector and the resulting vector is xor'ed with 128 to shift the range from signed to unsigned byte */ -#define PCKEV_XORI128_UB(in0, in1) ({ \ - v16u8 out_m; \ - \ - out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ - out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ - out_m; \ -}) +#define PCKEV_XORI128_UB(in0, in1) \ + ({ \ + v16u8 out_m; \ + \ + out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ + out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ + out_m; \ + }) /* Description : Converts inputs to unsigned bytes, interleave, average & store as 8x4 unsigned byte block Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, pdst, stride */ -#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \ - dst0, 
dst1, dst2, dst3, pdst, stride) { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - \ - tmp0_m = PCKEV_XORI128_UB(in0, in1); \ - tmp1_m = PCKEV_XORI128_UB(in2, in3); \ - ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ -} +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ + pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + } /* Description : Pack even byte elements and store byte vector in destination memory Arguments : Inputs - in0, in1, pdst */ -#define PCKEV_ST_SB(in0, in1, pdst) { \ - v16i8 tmp_m; \ - \ - tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ - ST_SB(tmp_m, (pdst)); \ -} +#define PCKEV_ST_SB(in0, in1, pdst) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ + ST_SB(tmp_m, (pdst)); \ + } /* Description : Horizontal 2 tap filter kernel code Arguments : Inputs - in0, in1, mask, coeff, shift */ -#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \ - v16i8 tmp0_m; \ - v8u16 tmp1_m; \ - \ - tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ - tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ - tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ - \ - tmp1_m; \ -}) -#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ + ({ \ + v16i8 tmp0_m; \ + v8u16 tmp1_m; \ + \ + tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ + tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ + tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ + \ + tmp1_m; \ + }) +#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c index 3bdec28e6ef058bc7d6f173d6f38736d29afd4c2..6455814e1b84d2fbeffefd7fbfd64187e1c4c6b2 100644 --- a/vpx_dsp/mips/sad_msa.c +++ b/vpx_dsp/mips/sad_msa.c @@ -11,12 +11,13 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" -#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) { \ - out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ - out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ -} +#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ + } #define SAD_INSVE_W4_UB(...) 
SAD_INSVE_W4(v16u8, __VA_ARGS__) static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, @@ -58,8 +59,8 @@ static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride, LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); sad += SAD_UB2_UH(src0, src1, ref0, ref1); } @@ -214,8 +215,8 @@ static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride, src += (4 * src_stride); LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, + ref0, ref1); sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); @@ -473,8 +474,8 @@ static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride, src += (4 * src_stride); LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, + ref0, ref1); sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); @@ -793,9 +794,9 @@ static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, } static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t * const aref_ptr[], - int32_t ref_stride, - int32_t height, uint32_t *sad_array) { + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; int32_t ht_cnt; uint32_t src0, src1, src2, src3; @@ -854,9 +855,9 @@ static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, } static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t * const aref_ptr[], - int32_t ref_stride, - int32_t height, uint32_t *sad_array) { + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { int32_t ht_cnt; const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; v16u8 src0, src1, src2, src3; @@ -905,9 +906,9 @@ static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, } static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t * const aref_ptr[], - int32_t ref_stride, - int32_t height, uint32_t *sad_array) { + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { int32_t ht_cnt; const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; v16u8 src, ref0, ref1, ref2, ref3, diff; @@ -970,9 +971,9 @@ static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, } static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, - const uint8_t * const aref_ptr[], - int32_t ref_stride, - int32_t height, uint32_t *sad_array) { + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; int32_t ht_cnt; v16u8 src0, src1, ref0, ref1; @@ -1014,9 +1015,9 @@ static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, } static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, - const uint8_t * const 
aref_ptr[], - int32_t ref_stride, - int32_t height, uint32_t *sad_array) { + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; int32_t ht_cnt; v16u8 src0, src1, src2, src3; @@ -1114,8 +1115,8 @@ static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride, ref += (4 * ref_stride); LD_UB2(sec_pred, 16, pred0, pred1); sec_pred += 32; - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1); sad += SAD_UB2_UH(src0, src1, diff0, diff1); } @@ -1213,8 +1214,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, ref += ref_stride; LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, - comp0, comp1, comp2, comp3); + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); @@ -1224,8 +1225,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, ref += ref_stride; LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, - comp0, comp1, comp2, comp3); + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); @@ -1235,8 +1236,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, ref += ref_stride; LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, - comp0, comp1, comp2, comp3); + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); @@ -1246,8 +1247,8 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, ref += ref_stride; LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); sec_pred += 64; - AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, - comp0, comp1, comp2, comp3); + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); } @@ -1258,180 +1259,180 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, return HADD_SW_S32(sad); } -#define VPX_SAD_4xHEIGHT_MSA(height) \ -uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ -} +#define VPX_SAD_4xHEIGHT_MSA(height) \ + uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ + } -#define VPX_SAD_8xHEIGHT_MSA(height) \ -uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ -} +#define VPX_SAD_8xHEIGHT_MSA(height) \ + uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ + 
const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ + } -#define VPX_SAD_16xHEIGHT_MSA(height) \ -uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ -} +#define VPX_SAD_16xHEIGHT_MSA(height) \ + uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ + } -#define VPX_SAD_32xHEIGHT_MSA(height) \ -uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ -} +#define VPX_SAD_32xHEIGHT_MSA(height) \ + uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ + } -#define VPX_SAD_64xHEIGHT_MSA(height) \ -uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride) { \ - return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ -} +#define VPX_SAD_64xHEIGHT_MSA(height) \ + uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ + } -#define VPX_SAD_4xHEIGHTx3_MSA(height) \ -void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_4xHEIGHTx3_MSA(height) \ + void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_8xHEIGHTx3_MSA(height) \ -void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_8xHEIGHTx3_MSA(height) \ + void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_16xHEIGHTx3_MSA(height) \ -void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_16xHEIGHTx3_MSA(height) \ + void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_32xHEIGHTx3_MSA(height) \ -void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_32xHEIGHTx3_MSA(height) \ + void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + 
sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_64xHEIGHTx3_MSA(height) \ -void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_64xHEIGHTx3_MSA(height) \ + void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_4xHEIGHTx8_MSA(height) \ -void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_4xHEIGHTx8_MSA(height) \ + void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_8xHEIGHTx8_MSA(height) \ -void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_8xHEIGHTx8_MSA(height) \ + void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_16xHEIGHTx8_MSA(height) \ -void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_16xHEIGHTx8_MSA(height) \ + void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_32xHEIGHTx8_MSA(height) \ -void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_32xHEIGHTx8_MSA(height) \ + void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_64xHEIGHTx8_MSA(height) \ -void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ -} +#define VPX_SAD_64xHEIGHTx8_MSA(height) \ + void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } -#define VPX_SAD_4xHEIGHTx4D_MSA(height) \ -void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ -} +#define VPX_SAD_4xHEIGHTx4D_MSA(height) \ + void vpx_sad4x##height##x4d_msa(const uint8_t *src, 
int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } -#define VPX_SAD_8xHEIGHTx4D_MSA(height) \ -void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ -} +#define VPX_SAD_8xHEIGHTx4D_MSA(height) \ + void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } -#define VPX_SAD_16xHEIGHTx4D_MSA(height) \ -void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ -} +#define VPX_SAD_16xHEIGHTx4D_MSA(height) \ + void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } -#define VPX_SAD_32xHEIGHTx4D_MSA(height) \ -void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ -} +#define VPX_SAD_32xHEIGHTx4D_MSA(height) \ + void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } -#define VPX_SAD_64xHEIGHTx4D_MSA(height) \ -void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ - sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ -} +#define VPX_SAD_64xHEIGHTx4D_MSA(height) \ + void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } -#define VPX_AVGSAD_4xHEIGHT_MSA(height) \ -uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_4width_msa(src, src_stride, ref, ref_stride, \ - height, second_pred); \ -} +#define VPX_AVGSAD_4xHEIGHT_MSA(height) \ + uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } -#define VPX_AVGSAD_8xHEIGHT_MSA(height) \ -uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_8width_msa(src, src_stride, ref, ref_stride, \ - height, second_pred); \ -} +#define VPX_AVGSAD_8xHEIGHT_MSA(height) \ + uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } -#define 
VPX_AVGSAD_16xHEIGHT_MSA(height) \ -uint32_t vpx_sad16x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_16width_msa(src, src_stride, ref, ref_stride, \ - height, second_pred); \ -} +#define VPX_AVGSAD_16xHEIGHT_MSA(height) \ + uint32_t vpx_sad16x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } -#define VPX_AVGSAD_32xHEIGHT_MSA(height) \ -uint32_t vpx_sad32x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_32width_msa(src, src_stride, ref, ref_stride, \ - height, second_pred); \ -} +#define VPX_AVGSAD_32xHEIGHT_MSA(height) \ + uint32_t vpx_sad32x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } -#define VPX_AVGSAD_64xHEIGHT_MSA(height) \ -uint32_t vpx_sad64x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - const uint8_t *second_pred) { \ - return avgsad_64width_msa(src, src_stride, ref, ref_stride, \ - height, second_pred); \ -} +#define VPX_AVGSAD_64xHEIGHT_MSA(height) \ + uint32_t vpx_sad64x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } // 64x64 VPX_SAD_64xHEIGHT_MSA(64); diff --git a/vpx_dsp/mips/sub_pixel_variance_msa.c b/vpx_dsp/mips/sub_pixel_variance_msa.c index a592a2d078e80b361a15c759ecda263f5c811b3b..313e06f92dda0713a0f73d9363c2b079372bf9a8 100644 --- a/vpx_dsp/mips/sub_pixel_variance_msa.c +++ b/vpx_dsp/mips/sub_pixel_variance_msa.c @@ -14,29 +14,23 @@ #include "vpx_dsp/variance.h" static const uint8_t bilinear_filters_msa[8][2] = { - { 128, 0, }, - { 112, 16, }, - { 96, 32, }, - { 80, 48, }, - { 64, 64, }, - { 48, 80, }, - { 32, 96, }, - { 16, 112, }, + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; -#define CALC_MSE_AVG_B(src, ref, var, sub) { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ - \ - sub += res_l0_m + res_l1_m; \ -} +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } -#define VARIANCE_WxH(sse, diff, shift) \ - sse - (((uint32_t)diff * diff) >> shift) +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) #define VARIANCE_LARGE_WxH(sse, diff, shift) \ sse - (((int64_t)diff * diff) >> shift) @@ -45,8 +39,7 @@ static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, - const uint8_t *sec_pred, - int32_t height, + const uint8_t *sec_pred, int32_t height, 
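/*
 * CALC_MSE_AVG_B above accumulates squared pixel differences into var and
 * the signed differences into the avg/sub accumulator; the VARIANCE_WxH and
 * VARIANCE_LARGE_WxH macros then combine the two reductions as
 *
 *   variance = sse - (sum * sum) / (width * height)
 *
 * with shift = log2(width * height). A sketch in plain C using a 64-bit
 * product, as in the LARGE variant (the small-block macro keeps the product
 * in 32 bits); the function name is illustrative only.
 */
static uint32_t variance_sketch(uint32_t sse, int32_t sum, int32_t shift) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> shift);
}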
int32_t *diff) { int32_t ht_cnt; uint32_t src0, src1, src2, src3; @@ -81,8 +74,7 @@ static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, - const uint8_t *sec_pred, - int32_t height, + const uint8_t *sec_pred, int32_t height, int32_t *diff) { int32_t ht_cnt; v16u8 src0, src1, src2, src3; @@ -99,8 +91,8 @@ static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); ref_ptr += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); CALC_MSE_AVG_B(src0, ref0, var, avg); CALC_MSE_AVG_B(src1, ref1, var, avg); @@ -117,8 +109,7 @@ static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, const uint8_t *ref_ptr, int32_t ref_stride, const uint8_t *sec_pred, - int32_t height, - int32_t *diff) { + int32_t height, int32_t *diff) { int32_t ht_cnt; v16u8 src, ref, pred; v8i16 avg = { 0 }; @@ -173,8 +164,7 @@ static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr, const uint8_t *ref_ptr, int32_t ref_stride, const uint8_t *sec_pred, - int32_t height, - int32_t *diff) { + int32_t height, int32_t *diff) { int32_t ht_cnt; v16u8 src0, src1, ref0, ref1, pred0, pred1; v8i16 avg = { 0 }; @@ -232,8 +222,7 @@ static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, - const uint8_t *sec_pred, - int32_t *diff) { + const uint8_t *sec_pred, int32_t *diff) { int32_t ht_cnt; v16u8 src0, src1, ref0, ref1, pred0, pred1; v8i16 avg0 = { 0 }; @@ -293,8 +282,7 @@ static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, - const uint8_t *sec_pred, - int32_t *diff) { + const uint8_t *sec_pred, int32_t *diff) { int32_t ht_cnt; v16u8 src0, src1, src2, src3; v16u8 ref0, ref1, ref2, ref3; @@ -310,8 +298,8 @@ static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, src_ptr += src_stride; LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, - src0, src1, src2, src3); + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); CALC_MSE_AVG_B(src0, ref0, var, avg0); CALC_MSE_AVG_B(src2, ref2, var, avg0); CALC_MSE_AVG_B(src1, ref1, var, avg1); @@ -323,8 +311,8 @@ static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, src_ptr += src_stride; LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, - src0, src1, src2, src3); + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); CALC_MSE_AVG_B(src0, ref0, var, avg0); CALC_MSE_AVG_B(src2, ref2, var, avg0); CALC_MSE_AVG_B(src1, ref1, var, avg1); @@ -343,8 +331,7 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, - const uint8_t *sec_pred, - int32_t *diff) { + const uint8_t *sec_pred, int32_t *diff) { int32_t ht_cnt; v16u8 src0, src1, src2, src3; v16u8 ref0, ref1, ref2, ref3; @@ -362,8 +349,8 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, src_ptr += src_stride; LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, 
pred3, - src0, src1, src2, src3); + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); CALC_MSE_AVG_B(src0, ref0, var, avg0); CALC_MSE_AVG_B(src1, ref1, var, avg1); CALC_MSE_AVG_B(src2, ref2, var, avg2); @@ -375,8 +362,8 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, src_ptr += src_stride; LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); ref_ptr += ref_stride; - AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, - src0, src1, src2, src3); + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); CALC_MSE_AVG_B(src0, ref0, var, avg0); CALC_MSE_AVG_B(src1, ref1, var, avg1); CALC_MSE_AVG_B(src2, ref2, var, avg2); @@ -392,13 +379,9 @@ static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -420,11 +403,11 @@ static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src, INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); CALC_MSE_AVG_B(src0, ref, var, avg); @@ -436,13 +419,9 @@ static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 filt0, out, ref0, ref1, ref2, ref3; @@ -464,11 +443,11 @@ static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src, PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); CALC_MSE_AVG_B(out, ref0, var, avg); out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); @@ -481,13 +460,9 @@ static uint32_t 
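/*
 * The horizontal sub-pixel paths above (VSHF_B2 to gather adjacent pixel
 * pairs, DOTP with the two filter taps, SRARI by FILTER_BITS) apply a
 * two-tap bilinear filter before the SSE/sum accumulation. Per output pixel
 * this is equivalent to the scalar form below, with {f0, f1} taken from
 * bilinear_filters_msa[offset]; the taps sum to 128, and the sketch assumes
 * FILTER_BITS == 7, the rounding shift used with these filters. The vertical
 * path is the same with the second tap applied to the pixel one row below,
 * and the hv path filters horizontally first, then vertically on the
 * intermediate rows.
 */
static uint8_t bilinear_pixel_sketch(uint8_t a, uint8_t b, uint8_t f0,
                                     uint8_t f1) {
  const int32_t sum = a * f0 + b * f1; /* taps sum to 128 */
  return (uint8_t)((sum + 64) >> 7);   /* round to nearest, >> FILTER_BITS */
}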
sub_pixel_sse_diff_8width_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7; @@ -512,14 +487,14 @@ static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src, VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - out0, out1, out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, - out4, out5, out6, out7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, - src0, src1, src2, src3); + PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1, + src2, src3); CALC_MSE_AVG_B(src0, dst0, var, avg); CALC_MSE_AVG_B(src1, dst1, var, avg); CALC_MSE_AVG_B(src2, dst2, var, avg); @@ -532,13 +507,9 @@ static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; @@ -554,13 +525,9 @@ static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; @@ -576,13 +543,9 @@ static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -608,8 +571,8 @@ static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src, dst += (4 * dst_stride); INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, - src10_r, src21_r, src32_r, src43_r); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, 
src3, src10_r, src21_r, + src32_r, src43_r); ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); @@ -624,13 +587,9 @@ static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4; @@ -654,10 +613,10 @@ static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src, dst += (4 * dst_stride); PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, - vec0, vec1, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - tmp0, tmp1, tmp2, tmp3); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); CALC_MSE_AVG_B(src0, ref0, var, avg); @@ -671,13 +630,9 @@ static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 ref0, ref1, ref2, ref3; @@ -734,13 +689,9 @@ static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; @@ -756,13 +707,9 @@ static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; @@ -778,14 +725,10 @@ static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t 
*dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -831,14 +774,10 @@ static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 ref0, ref1, ref2, ref3; @@ -892,14 +831,10 @@ static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; @@ -969,14 +904,10 @@ static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; @@ -993,14 +924,10 @@ static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; @@ -1017,14 +944,10 @@ static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -1049,11 +972,11 @@ static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src, INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); VSHF_B2_UH(src0, src0, src1, src1, mask, 
mask, vec0, vec1); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); out = __msa_aver_u_b(out, pred); @@ -1066,14 +989,10 @@ static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 out, pred, filt0; @@ -1096,11 +1015,11 @@ static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src, PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - vec0, vec1, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, - src0, src1, src2, src3); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); pred = LD_UB(sec_pred); @@ -1120,15 +1039,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff, - int32_t width) { +static uint32_t subpel_avg_ssediff_16w_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { int16_t filtval; uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7; @@ -1157,16 +1071,16 @@ static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src, VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - out0, out1, out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, - out4, out5, out6, out7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, - tmp0, tmp1, tmp2, tmp3); - AVER_UB4_UB(tmp0, pred0, 
tmp1, pred1, tmp2, pred2, tmp3, pred3, - tmp0, tmp1, tmp2, tmp3); + PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1, + tmp2, tmp3); + AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1, + tmp2, tmp3); CALC_MSE_AVG_B(tmp0, dst0, var, avg); CALC_MSE_AVG_B(tmp1, dst1, var, avg); @@ -1180,33 +1094,25 @@ static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, filter, height, diff, 16); } -static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 32); + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); src += 16; dst += 16; sec_pred += 16; @@ -1217,21 +1123,17 @@ static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 64); + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); src += 16; dst += 16; sec_pred += 16; @@ -1242,14 +1144,10 @@ static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -1276,8 +1174,8 @@ static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src, dst += (4 * dst_stride); INSERT_W4_UB(ref0, ref1, ref2, 
ref3, ref); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, - src10_r, src21_r, src32_r, src43_r); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); @@ -1294,14 +1192,10 @@ static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4; @@ -1326,10 +1220,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src, LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); dst += (4 * dst_stride); PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); - ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, - vec0, vec1, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, - tmp0, tmp1, tmp2, tmp3); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); @@ -1345,15 +1239,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff, - int32_t width) { +static uint32_t subpel_avg_ssediff_16w_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { int16_t filtval; uint32_t loop_cnt; v16u8 ref0, ref1, ref2, ref3; @@ -1401,8 +1290,8 @@ static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src, LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); dst += (4 * dst_stride); - AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, - out0, out1, out2, out3); + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); CALC_MSE_AVG_B(out0, ref0, var, avg); CALC_MSE_AVG_B(out1, ref1, var, avg); @@ -1416,33 +1305,25 @@ static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src, return HADD_SW_S32(var); } -static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, filter, height, diff, 16); } -static uint32_t 
sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 32); + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); src += 16; dst += 16; sec_pred += 16; @@ -1453,21 +1334,17 @@ static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src, return sse; } -static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter, - int32_t height, - int32_t *diff) { +static uint32_t sub_pixel_avg_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { - sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, - sec_pred, filter, height, - &diff0[loop_cnt], 64); + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); src += 16; dst += 16; sec_pred += 16; @@ -1479,11 +1356,9 @@ static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src, } static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; uint32_t ref0, ref1, ref2, ref3; @@ -1532,11 +1407,9 @@ static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( } static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { int16_t filtval; uint32_t loop_cnt; v16u8 ref0, ref1, ref2, ref3; @@ -1598,16 +1471,10 @@ static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( return HADD_SW_S32(var); } -static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src, - int32_t src_stride, - const uint8_t *dst, - int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, - const uint8_t *filter_vert, - int32_t height, - int32_t *diff, - int32_t width) { +static uint32_t subpel_avg_ssediff_16w_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t 
*filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { int16_t filtval; uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; @@ -1669,8 +1536,8 @@ static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src, LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); dst += (4 * dst_stride); - AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, - out0, out1, out2, out3); + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); CALC_MSE_AVG_B(out0, ref0, var, avg); CALC_MSE_AVG_B(out1, ref1, var, avg); @@ -1685,22 +1552,18 @@ static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src, } static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, sec_pred, filter_horiz, filter_vert, height, diff, 16); } static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[2]; @@ -1719,11 +1582,9 @@ static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( } static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( - const uint8_t *src, int32_t src_stride, - const uint8_t *dst, int32_t dst_stride, - const uint8_t *sec_pred, - const uint8_t *filter_horiz, const uint8_t *filter_vert, - int32_t height, int32_t *diff) { + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { uint32_t loop_cnt, sse = 0; int32_t diff0[4]; @@ -1756,47 +1617,40 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( #define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); #define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); -#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ -uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \ - int32_t src_stride, \ - int32_t xoffset, \ - int32_t yoffset, \ - const uint8_t *ref, \ - int32_t ref_stride, \ - uint32_t *sse) { \ - int32_t diff; \ - uint32_t var; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride, \ - ref, ref_stride, \ - h_filter, v_filter, \ - ht, &diff); \ - } else { \ - *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride, \ - ref, ref_stride, \ - v_filter, ht, &diff); \ - } \ - \ - var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride, \ - ref, ref_stride, \ - h_filter, ht, &diff); \ - \ - var = 
VARIANCE_##wd##Wx##ht##H(*sse, diff); \ - } else { \ - var = vpx_variance##wd##x##ht##_msa(src, src_stride, \ - ref, ref_stride, sse); \ - } \ - } \ - \ - return var; \ -} +#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_msa( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ + \ + return var; \ + } VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4); VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8); @@ -1817,42 +1671,37 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32); VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); #define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ -uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, \ - int32_t xoffset, int32_t yoffset, \ - const uint8_t *ref_ptr, int32_t ref_stride, \ - uint32_t *sse, const uint8_t *sec_pred) { \ - int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, v_filter, \ - ht, &diff); \ - } \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - ht, &diff); \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ } else { \ - *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, ht, &diff); \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, ht, &diff); \ + } \ } \ - } \ \ - return VARIANCE_##wd##Wx##ht##H(*sse, 
diff); \ -} + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4); VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8); @@ -1870,11 +1719,9 @@ VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32); uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int32_t src_stride, - int32_t xoffset, - int32_t yoffset, + int32_t xoffset, int32_t yoffset, const uint8_t *ref_ptr, - int32_t ref_stride, - uint32_t *sse, + int32_t ref_stride, uint32_t *sse, const uint8_t *sec_pred) { int32_t diff; const uint8_t *h_filter = bilinear_filters_msa[xoffset]; @@ -1882,22 +1729,19 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, if (yoffset) { if (xoffset) { - *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride, - ref_ptr, ref_stride, - sec_pred, h_filter, - v_filter, 64, &diff); + *sse = sub_pixel_avg_sse_diff_32width_hv_msa( + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, + v_filter, 64, &diff); } else { - *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, - ref_ptr, ref_stride, - sec_pred, v_filter, - 64, &diff); + *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + v_filter, 64, &diff); } } else { if (xoffset) { - *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, - ref_ptr, ref_stride, - sec_pred, h_filter, - 64, &diff); + *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + h_filter, 64, &diff); } else { *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, &diff); @@ -1907,46 +1751,38 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, return VARIANCE_32Wx64H(*sse, diff); } -#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ -uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr, \ - int32_t src_stride, \ - int32_t xoffset, \ - int32_t yoffset, \ - const uint8_t *ref_ptr, \ - int32_t ref_stride, \ - uint32_t *sse, \ - const uint8_t *sec_pred) { \ - int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ - \ - if (yoffset) { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - v_filter, ht, &diff); \ - } else { \ - *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, v_filter, \ - ht, &diff); \ - } \ - } else { \ - if (xoffset) { \ - *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, h_filter, \ - ht, &diff); \ - } else { \ - *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, \ - ref_ptr, ref_stride, \ - sec_pred, &diff); \ - } \ - } \ - \ - return VARIANCE_64Wx##ht##H(*sse, diff); \ -} +#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ + uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_msa( \ + src_ptr, 
src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32); VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64); diff --git a/vpx_dsp/mips/subtract_msa.c b/vpx_dsp/mips/subtract_msa.c index 9ac43c5cd5225c8a579d47f3ccd31f38bb47de66..391a7ebf66251daec485a91e7ca936a4996cc42b 100644 --- a/vpx_dsp/mips/subtract_msa.c +++ b/vpx_dsp/mips/subtract_msa.c @@ -68,8 +68,8 @@ static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); src += (8 * src_stride); - LD_SB8(pred, pred_stride, - pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7); + LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6, + pred7); pred += (8 * pred_stride); ILVRL_B2_UB(src0, pred0, src_l0, src_l1); @@ -226,31 +226,31 @@ static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, } } -void vpx_subtract_block_msa(int32_t rows, int32_t cols, - int16_t *diff_ptr, ptrdiff_t diff_stride, - const uint8_t *src_ptr, ptrdiff_t src_stride, - const uint8_t *pred_ptr, ptrdiff_t pred_stride) { +void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { if (rows == cols) { switch (rows) { case 4: - sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); + sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); break; case 8: - sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); + sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); break; case 16: - sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); + sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); break; case 32: - sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); + sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); break; case 64: - sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); + sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); break; default: vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, diff --git a/vpx_dsp/mips/txfm_macros_msa.h b/vpx_dsp/mips/txfm_macros_msa.h index 68c63d56f67f524311ff62ae8aca30a7a4758b3c..da100f6a9808af6c3105057a9662159f884fe3d9 100644 --- a/vpx_dsp/mips/txfm_macros_msa.h +++ b/vpx_dsp/mips/txfm_macros_msa.h @@ -13,81 +13,84 @@ #include "vpx_dsp/mips/macros_msa.h" -#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \ - v8i16 k0_m = __msa_fill_h(cnst0); \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - s0_m = (v4i32)__msa_fill_h(cnst1); \ - k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ - \ - ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ - ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ - DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ - SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ - out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ - \ - DOTP_SH2_SW(s3_m, s2_m, 
k0_m, k0_m, s1_m, s0_m); \ - SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ - out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ -} +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + { \ + v8i16 k0_m = __msa_fill_h(cnst0); \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + s0_m = (v4i32)__msa_fill_h(cnst1); \ + k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ + \ + ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ + DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + \ + DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + } -#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, \ - dst0, dst1, dst2, dst3) { \ - v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ - v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ - \ - DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, \ - tp0_m, tp2_m, tp3_m, tp4_m); \ - DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, \ - tp5_m, tp6_m, tp7_m, tp8_m); \ - BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ - BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ - SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ - SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ - PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, \ - dst0, dst1, dst2, dst3); \ -} +#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \ + dst1, dst2, dst3) \ + { \ + v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ + v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ + \ + DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \ + tp4_m); \ + DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \ + tp8_m); \ + BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ + BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ + SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ + SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \ + dst1, dst2, dst3); \ + } -#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \ - v8i16 dst_m; \ - v4i32 tp0_m, tp1_m; \ - \ - DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ - SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ - dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ - \ - dst_m; \ -}) +#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \ + ({ \ + v8i16 dst_m; \ + v4i32 tp0_m, tp1_m; \ + \ + DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ + SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ + dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ + \ + dst_m; \ + }) -#define MADD_SHORT(m0, m1, c0, c1, res0, res1) { \ - v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ - v8i16 madd_s0_m, madd_s1_m; \ - \ - ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \ - c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \ - SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ -} +#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \ + { \ + v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ + v8i16 madd_s0_m, madd_s1_m; \ + \ + ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \ + madd0_m, madd1_m, madd2_m, madd3_m); \ + SRARI_W4_SW(madd0_m, 
madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ + } -#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \ - out0, out1, out2, out3) { \ - v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ - \ - ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ - ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ - cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ - m4_m, m5_m, tmp3_m, tmp2_m); \ - SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \ - cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \ - m4_m, m5_m, tmp3_m, tmp2_m); \ - SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ -} +#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \ + cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \ + cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ + } #endif // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ diff --git a/vpx_dsp/mips/variance_msa.c b/vpx_dsp/mips/variance_msa.c index 33e175560fd3207fc86917095ca6c03d393a670e..085990e48459f24365c18e842f88bcc2a201928e 100644 --- a/vpx_dsp/mips/variance_msa.c +++ b/vpx_dsp/mips/variance_msa.c @@ -11,28 +11,29 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" -#define CALC_MSE_B(src, ref, var) { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ -} +#define CALC_MSE_B(src, ref, var) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + } -#define CALC_MSE_AVG_B(src, ref, var, sub) { \ - v16u8 src_l0_m, src_l1_m; \ - v8i16 res_l0_m, res_l1_m; \ - \ - ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ - HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ - DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ - \ - sub += res_l0_m + res_l1_m; \ -} +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, 
var); \ + \ + sub += res_l0_m + res_l1_m; \ + } -#define VARIANCE_WxH(sse, diff, shift) \ - sse - (((uint32_t)diff * diff) >> shift) +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) #define VARIANCE_LARGE_WxH(sse, diff, shift) \ sse - (((int64_t)diff * diff) >> shift) @@ -80,8 +81,8 @@ static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); ref_ptr += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); CALC_MSE_AVG_B(src0, ref0, var, avg); CALC_MSE_AVG_B(src1, ref1, var, avg); } @@ -370,8 +371,8 @@ static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride, LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); ref_ptr += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, - src0, src1, ref0, ref1); + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); CALC_MSE_B(src0, ref0, var); CALC_MSE_B(src1, ref1, var); } @@ -526,19 +527,17 @@ uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride, #define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); #define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); -#define VPX_VARIANCE_WDXHT_MSA(wd, ht) \ -uint32_t vpx_variance##wd##x##ht##_msa(const uint8_t *src, \ - int32_t src_stride, \ - const uint8_t *ref, \ - int32_t ref_stride, \ - uint32_t *sse) { \ - int32_t diff; \ - \ - *sse = sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, \ - ht, &diff); \ - \ - return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ -} +#define VPX_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t vpx_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } VPX_VARIANCE_WDXHT_MSA(4, 4); VPX_VARIANCE_WDXHT_MSA(4, 8); @@ -585,8 +584,7 @@ uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride, } uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse) { + const uint8_t *ref, int32_t ref_stride, uint32_t *sse) { *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8); return *sse; @@ -617,17 +615,15 @@ uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride, } void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse, int32_t *sum) { + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum); } void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - uint32_t *sse, int32_t *sum) { + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum); } -uint32_t vpx_get_mb_ss_msa(const int16_t *src) { - return get_mb_ss_msa(src); -} +uint32_t vpx_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); } diff --git a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c index f6244d834b7d4f24acd22c1b961071c62e71e798..ad2af286692155bccbee8df80abbbd41dee655d6 100644 --- 
a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -13,8 +13,7 @@ #include "vpx_dsp/mips/vpx_convolve_msa.h" static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; @@ -48,8 +47,7 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, } static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; @@ -92,10 +90,8 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, } static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { if (4 == height) { common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); @@ -105,10 +101,8 @@ static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, } static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { int32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; @@ -136,18 +130,16 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, - dst, dst_stride); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + dst_stride); dst += (4 * dst_stride); } } static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { int32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, dst0, dst1; @@ -199,11 +191,9 @@ static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, } static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 dst1, dst2, mask0, mask1, mask2, mask3; @@ -256,11 +246,9 @@ static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, } static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt, cnt; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 dst1, dst2, mask0, mask1, mask2, mask3; @@ -318,8 +306,7 @@ static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, mask; @@ -344,8 +331,7 @@ static 
void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; @@ -378,10 +364,8 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { if (4 == height) { common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); @@ -391,8 +375,7 @@ static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, mask; @@ -412,16 +395,13 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); } -static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, - int32_t height) { +static void common_hz_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { v16i8 src0, src1, src2, src3, mask; v16u8 filt0, dst0, dst1, dst2, dst3; v8u16 vec0, vec1, vec2, vec3, filt; @@ -442,8 +422,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -452,8 +432,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); dst += (4 * dst_stride); if (16 == height) { @@ -467,8 +447,8 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_SB4(src, src_stride, src0, src1, src2, src3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -477,16 +457,14 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, - dst, dst_stride); + 
PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); } } static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { if (4 == height) { common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); @@ -497,11 +475,9 @@ static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt0, dst0, dst1, dst2, dst3; @@ -566,11 +542,9 @@ static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt0, dst0, dst1, dst2, dst3; @@ -617,11 +591,9 @@ static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, } static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt0, dst0, dst1, dst2, dst3; @@ -662,8 +634,8 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -676,67 +648,55 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, if (((const int32_t *)filter_x)[0] == 0) { switch (w) { case 4: - common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], h); + common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); break; case 8: - common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], h); + common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); break; case 16: - common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], h); + common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); break; case 32: - common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], h); + common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); break; case 64: - common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], h); + common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, 
x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } else { switch (w) { case 4: - common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, h); + common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); break; case 8: - common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, h); + common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); break; case 16: - common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, h); + common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); break; case 32: - common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, h); + common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); break; case 64: - common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, h); + common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_msa.c index 2abde6de83c3e14131866426cb21721764d76cdc..1cfa63201c5acfefbf1170ec42d902be0c26edea 100644 --- a/vpx_dsp/mips/vpx_convolve8_avg_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -12,13 +12,9 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/vpx_convolve_msa.h" -static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_8ht_8vt_and_aver_dst_4w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; @@ -64,15 +60,15 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, src += (4 * src_stride); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, @@ -94,13 +90,9 @@ static void 
common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, } } -static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_8ht_8vt_and_aver_dst_8w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; @@ -154,20 +146,20 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, filt_vt2, filt_vt3); @@ -180,8 +172,8 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); - CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, - dst, dst_stride); + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst, + dst_stride); dst += (4 * dst_stride); hz_out6 = hz_out10; @@ -194,13 +186,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, } } -static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_8ht_8vt_and_aver_dst_16w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { int32_t multiple8_cnt; for (multiple8_cnt = 2; multiple8_cnt--;) { common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, @@ -210,13 +198,9 @@ static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src, } } -static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_8ht_8vt_and_aver_dst_32w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { int32_t multiple8_cnt; for (multiple8_cnt = 4; multiple8_cnt--;) { common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, @@ -226,13 +210,9 @@ static void 
common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src, } } -static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_8ht_8vt_and_aver_dst_64w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { int32_t multiple8_cnt; for (multiple8_cnt = 8; multiple8_cnt--;) { common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, @@ -242,12 +222,9 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src, } } -static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { +static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { v16i8 src0, src1, src2, src3, src4, mask; v16u8 filt_hz, filt_vt, vec0, vec1; v16u8 dst0, dst1, dst2, dst3, res0, res1; @@ -280,12 +257,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src, ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); } -static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { +static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -316,29 +290,25 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, - dst4, dst6); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, - tmp0, tmp1, tmp2, tmp3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, + tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, - res2, res3); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, - res2, res3); + PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2, + res3); + AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, + res3); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); } -static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_2ht_2vt_and_aver_dst_4w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { if (4 == height) { 
common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz, filter_vert); @@ -348,12 +318,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa(const uint8_t *src, } } -static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { +static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { v16i8 src0, src1, src2, src3, src4, mask; v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; @@ -390,17 +357,13 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src, tmp3 = __msa_dotp_u_h(vec3, filt_vt); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); } -static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, mask; v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; @@ -445,36 +408,27 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src, SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); dst += (4 * dst_stride); } } -static void common_hv_2ht_2vt_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_2ht_2vt_and_aver_dst_8w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { if (4 == height) { common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz, filter_vert); } else { - common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, - height); + common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( + src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); } } -static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_2ht_2vt_and_aver_dst_16w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; @@ -536,13 +490,9 @@ static void common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t *src, } } -static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t 
height) { +static void common_hv_2ht_2vt_and_aver_dst_32w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { int32_t multiple8_cnt; for (multiple8_cnt = 2; multiple8_cnt--;) { common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, @@ -552,13 +502,9 @@ static void common_hv_2ht_2vt_and_aver_dst_32w_msa(const uint8_t *src, } } -static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { +static void common_hv_2ht_2vt_and_aver_dst_64w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { int32_t multiple8_cnt; for (multiple8_cnt = 4; multiple8_cnt--;) { common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, @@ -571,8 +517,8 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -589,72 +535,69 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, ((const int32_t *)filter_y)[0] == 0) { switch (w) { case 4: - common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], h); + common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); break; case 8: - common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], h); + common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); break; case 16: - common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], &filt_ver[3], h); break; case 32: - common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], &filt_ver[3], h); break; case 64: - common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], &filt_ver[3], h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); } else { switch (w) { case 4: - common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, h); + common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + 
(int32_t)dst_stride, filt_hor, + filt_ver, h); break; case 8: - common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, h); + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); break; case 16: - common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, h); + common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); break; case 32: - common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, h); + common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); break; case 64: - common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, h); + common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c index 0164e41aa161a3c5614b86c2ac71687089a29083..146ce3b2f596d63ffc2bcc01ae03da08cdf9cbc4 100644 --- a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -13,10 +13,8 @@ #include "vpx_dsp/mips/vpx_convolve_msa.h" static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; @@ -73,10 +71,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, } static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; @@ -106,18 +102,18 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, XORI_B4_128_SB(src7, src8, src9, src10); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); - out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, - filt1, filt2, filt3); - out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, - filt1, filt2, filt3); - out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, - filt1, filt2, filt3); + out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1, + filt2, filt3); + out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1, + filt2, filt3); + out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1, + filt2, filt3); out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, - dst, dst_stride); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, 
dst2, dst3, dst, + dst_stride); dst += (4 * dst_stride); src10_r = src54_r; @@ -130,13 +126,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, } } -static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, - int32_t height, - int32_t width) { +static void common_vt_8t_and_aver_dst_16w_mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, int32_t width) { const uint8_t *src_tmp; uint8_t *dst_tmp; uint32_t loop_cnt, cnt; @@ -227,38 +219,31 @@ static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src, } static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, 16); } static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, 32); } static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, 64); } static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16i8 src0, src1, src2, src3, src4; @@ -292,8 +277,7 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -311,15 +295,15 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, src8 = LD_SB(src); LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, - dst2, dst3); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, + dst3); ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, src76_r, src87_r); - ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, - src87_r, src76_r, src2110, src4332, src6554, src8776); + ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); @@ -331,10 +315,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { if (4 == height) 
{ common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); @@ -344,8 +326,7 @@ static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { v16u8 src0, src1, src2, src3, src4; @@ -364,16 +345,13 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); } -static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, - int32_t height) { +static void common_vt_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; @@ -393,22 +371,22 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, src += (8 * src_stride); LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, - vec2, vec3); - ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, - vec6, vec7); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, + dst_stride); dst += (4 * dst_stride); DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, - dst, dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, + dst_stride); dst += (4 * dst_stride); src0 = src8; @@ -416,10 +394,8 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, - int32_t dst_stride, - int8_t *filter, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, int32_t height) { if (4 == height) { common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); @@ -430,11 +406,9 @@ static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; @@ -481,11 +455,9 @@ static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, 
- uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; @@ -554,11 +526,9 @@ static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, } static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, - int32_t height) { + int8_t *filter, int32_t height) { uint32_t loop_cnt; v16u8 src0, src1, src2, src3, src4, src5; v16u8 src6, src7, src8, src9, src10, src11, filt0; @@ -636,8 +606,8 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -650,68 +620,56 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, if (((const int32_t *)filter_y)[0] == 0) { switch (w) { case 4: - common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_ver[3], h); + common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); break; case 8: - common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_ver[3], h); + common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); break; case 16: - common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_ver[3], h); + common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); break; case 32: - common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_ver[3], h); + common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); break; case 64: - common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_ver[3], h); + common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } else { switch (w) { case 4: - common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_ver, h); + common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); break; case 8: - common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_ver, h); + common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); break; case 16: - common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_ver, h); + common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); break; case 32: - common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_ver, h); + common_vt_8t_and_aver_dst_32w_msa(src, 
(int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); break; case 64: - common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_ver, h); + common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c index dbd120b0d5c546629f1b1175147e8c9169b45947..9e8bf7b5194374ac0d384bb690e8649c4fb353ec 100644 --- a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -325,7 +325,7 @@ static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, src1, src2, src3); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); @@ -347,7 +347,7 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); @@ -355,8 +355,8 @@ static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, vec6, vec7); SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); - PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, - res2, res3); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); @@ -383,7 +383,7 @@ static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, src1, src2, src3); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -406,7 +406,7 @@ static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); @@ -482,7 +482,7 @@ static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, src2, src4, src6); LD_SB4(src + 8, src_stride, src1, src3, src5, src7); @@ -545,7 +545,7 @@ static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); for (loop_cnt = height >> 1; loop_cnt--;) { src0 = LD_SB(src); @@ -590,7 +590,7 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, /* rearranging filter */ filt = LD_UH(filter); - filt0 = (v16u8) 
__msa_splati_h((v8i16) filt, 0); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); for (loop_cnt = height; loop_cnt--;) { src0 = LD_SB(src); @@ -622,8 +622,8 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -636,67 +636,55 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, if (((const int32_t *)filter_x)[0] == 0) { switch (w) { case 4: - common_hz_2t_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_hor[3], h); break; case 8: - common_hz_2t_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_hor[3], h); break; case 16: - common_hz_2t_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_hor[3], h); break; case 32: - common_hz_2t_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_hor[3], h); break; case 64: - common_hz_2t_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_hor[3], h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } else { switch (w) { case 4: - common_hz_8t_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_hor, h); break; case 8: - common_hz_8t_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_hor, h); break; case 16: - common_hz_8t_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_hor, h); break; case 32: - common_hz_8t_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_hor, h); break; case 64: - common_hz_8t_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_hor, h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_msa.c b/vpx_dsp/mips/vpx_convolve8_msa.c index 7546f13150095c355b8fd1a200071e668b96978e..b16ec57886a533a1283c7f440f5bd24b2b2f34d5 100644 --- a/vpx_dsp/mips/vpx_convolve8_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_msa.c @@ -69,15 +69,15 @@ static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, 
filt_hz3); + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, @@ -151,20 +151,20 @@ static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, XORI_B4_128_SB(src7, src8, src9, src10); - hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, filt_vt2, filt_vt3); @@ -295,11 +295,11 @@ static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, - vec4, vec5, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4, + vec5, vec6, vec7); SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); - PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, - res2, res3); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); @@ -361,12 +361,10 @@ static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, } static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, - int32_t src_stride, - uint8_t *dst, + int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, - int8_t *filter_vert, - int32_t height) { + int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; v16i8 src0, src1, src2, src3, src4, mask, out0, out1; v16u8 filt_hz, filt_vt, vec0; @@ -542,11 +540,10 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, } } -void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t x_step_q4, - const int16_t *filter_y, 
int32_t y_step_q4, - int32_t w, int32_t h) { +void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int32_t x_step_q4, const int16_t *filter_y, + int32_t y_step_q4, int32_t w, int32_t h) { int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -563,72 +560,69 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, ((const int32_t *)filter_y)[0] == 0) { switch (w) { case 4: - common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], (int32_t)h); + common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); break; case 8: - common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], (int32_t)h); + common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); break; case 16: - common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], (int32_t)h); + common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); break; case 32: - common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], (int32_t)h); + common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); break; case 64: - common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], (int32_t)h); + common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); } else { switch (w) { case 4: - common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, (int32_t)h); + common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); break; case 8: - common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, (int32_t)h); + common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); break; case 16: - common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, (int32_t)h); + common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); break; case 32: - common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, (int32_t)h); + common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); break; case 64: - common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, - filt_hor, filt_ver, (int32_t)h); + common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, 
filt_ver, + (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_vert_msa.c index 527d4571991cd5ba56b5505f15bf74620ec3bcc2..410682271f5872f29eb53736db964568e2a5cb58 100644 --- a/vpx_dsp/mips/vpx_convolve8_vert_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -222,11 +222,11 @@ static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); src_tmp += (7 * src_stride); - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, - src32_r, src54_r, src21_r); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, - src32_l, src54_l, src21_l); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); for (loop_cnt = (height >> 2); loop_cnt--;) { @@ -344,8 +344,8 @@ static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, src32_r, src43_r); ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, src76_r, src87_r); - ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, - src87_r, src76_r, src2110, src4332, src6554, src8776); + ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); @@ -407,10 +407,10 @@ static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); src += (8 * src_stride); - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, - vec2, vec3); - ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, - vec6, vec7); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); @@ -629,8 +629,8 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -643,67 +643,55 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, if (((const int32_t *)filter_y)[0] == 0) { switch (w) { case 4: - common_vt_2t_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_ver[3], h); break; case 8: - common_vt_2t_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_ver[3], 
h); break; case 16: - common_vt_2t_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_ver[3], h); break; case 32: - common_vt_2t_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_ver[3], h); break; case 64: - common_vt_2t_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, &filt_ver[3], h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } else { switch (w) { case 4: - common_vt_8t_4w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_ver, h); break; case 8: - common_vt_8t_8w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_ver, h); break; case 16: - common_vt_8t_16w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_ver, h); break; case 32: - common_vt_8t_32w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_ver, h); break; case 64: - common_vt_8t_64w_msa(src, (int32_t)src_stride, - dst, (int32_t)dst_stride, + common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, filt_ver, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve_avg_msa.c b/vpx_dsp/mips/vpx_convolve_avg_msa.c index 4c3d978031a57e008e5903e471e9ebd86b227035..45399bad852c552722f969c9c92a8c18995227b7 100644 --- a/vpx_dsp/mips/vpx_convolve_avg_msa.c +++ b/vpx_dsp/mips/vpx_convolve_avg_msa.c @@ -10,8 +10,8 @@ #include "vpx_dsp/mips/macros_msa.h" -static void avg_width4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { +static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { int32_t cnt; uint32_t out0, out1, out2, out3; v16u8 src0, src1, src2, src3; @@ -24,8 +24,8 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride, LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, - dst0, dst1, dst2, dst3); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); out0 = __msa_copy_u_w((v4i32)dst0, 0); out1 = __msa_copy_u_w((v4i32)dst1, 0); @@ -53,8 +53,8 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride, } } -static void avg_width8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { +static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { int32_t cnt; uint64_t out0, out1, out2, out3; v16u8 src0, src1, src2, src3; @@ -65,8 +65,8 @@ static void avg_width8_msa(const uint8_t *src, int32_t src_stride, src += (4 * src_stride); LD_UB4(dst, dst_stride, dst0, 
dst1, dst2, dst3); - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, - dst0, dst1, dst2, dst3); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); out0 = __msa_copy_u_d((v2i64)dst0, 0); out1 = __msa_copy_u_d((v2i64)dst1, 0); @@ -88,10 +88,10 @@ static void avg_width16_msa(const uint8_t *src, int32_t src_stride, src += (8 * src_stride); LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, - dst0, dst1, dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, - dst4, dst5, dst6, dst7); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); dst += (8 * dst_stride); } @@ -120,14 +120,14 @@ static void avg_width32_msa(const uint8_t *src, int32_t src_stride, LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15); dst_dup += (4 * dst_stride); - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, - dst0, dst1, dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, - dst4, dst5, dst6, dst7); - AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, - dst8, dst9, dst10, dst11); - AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, - dst12, dst13, dst14, dst15); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, + dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, + dst13, dst14, dst15); ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); @@ -166,14 +166,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride, LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15); dst_dup += dst_stride; - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, - dst0, dst1, dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, - dst4, dst5, dst6, dst7); - AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, - dst8, dst9, dst10, dst11); - AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, - dst12, dst13, dst14, dst15); + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, + dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, + dst13, dst14, dst15); ST_UB4(dst0, dst1, dst2, dst3, dst, 16); dst += dst_stride; diff --git a/vpx_dsp/mips/vpx_convolve_copy_msa.c b/vpx_dsp/mips/vpx_convolve_copy_msa.c index ba4012281e84ae9402cff599fabc65b65f81de18..c3d87a4ab8112a3ceec5d64cd7356c2bfdec2dc7 100644 --- a/vpx_dsp/mips/vpx_convolve_copy_msa.c +++ b/vpx_dsp/mips/vpx_convolve_copy_msa.c @@ -105,12 +105,12 @@ static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, dst_tmp = dst; for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_UB8(src_tmp, src_stride, - src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); src_tmp += (8 * src_stride); - ST_UB8(src0, 
src1, src2, src3, src4, src5, src6, src7, - dst_tmp, dst_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp, + dst_stride); dst_tmp += (8 * dst_stride); } diff --git a/vpx_dsp/mips/vpx_convolve_msa.h b/vpx_dsp/mips/vpx_convolve_msa.h index e0013983ae6ae4c0635819753def290e94917dac..198c21ed20a5d9624dd91aa702e49e76698fcdf3 100644 --- a/vpx_dsp/mips/vpx_convolve_msa.h +++ b/vpx_dsp/mips/vpx_convolve_msa.h @@ -16,104 +16,109 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; -#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \ - filt0, filt1, filt2, filt3) ({ \ - v8i16 tmp0, tmp1; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ - tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \ - tmp0 = __msa_adds_s_h(tmp0, tmp1); \ - \ - tmp0; \ -}) +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp0, tmp1; \ + \ + tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ + tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \ + tmp0 = __msa_adds_s_h(tmp0, tmp1); \ + \ + tmp0; \ + }) -#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, \ - filt_h0, filt_h1, filt_h2, filt_h3) ({ \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - v8i16 hz_out_m; \ - \ - VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, \ - vec0_m, vec1_m, vec2_m, vec3_m); \ - hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, \ - filt_h0, filt_h1, filt_h2, filt_h3); \ - \ - hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ - \ - hz_out_m; \ -}) +#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ + filt_h1, filt_h2, filt_h3) \ + ({ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8i16 hz_out_m; \ + \ + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \ + vec3_m); \ + hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \ + filt_h1, filt_h2, filt_h3); \ + \ + hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ + hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + \ + hz_out_m; \ + }) -#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ - mask0, mask1, mask2, mask3, \ - filt0, filt1, filt2, filt3, \ - out0, out1) { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ - DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ - VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ - DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ - ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ -} +#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, 
res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ + DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ + ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ + } -#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ - mask0, mask1, mask2, mask3, \ - filt0, filt1, filt2, filt3, \ - out0, out1, out2, out3) { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - res0_m, res1_m, res2_m, res3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ - res4_m, res5_m, res6_m, res7_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ - res0_m, res1_m, res2_m, res3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ - res4_m, res5_m, res6_m, res7_m); \ - ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ - res7_m, out0, out1, out2, out3); \ -} +#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1, out2, out3) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ + res4_m, res5_m, res6_m, res7_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ + res4_m, res5_m, res6_m, res7_m); \ + ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ + res7_m, out0, out1, out2, out3); \ + } -#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) { \ - v16u8 tmp_m; \ - \ - tmp_m = PCKEV_XORI128_UB(in1, in0); \ - tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ - ST_UB(tmp_m, (pdst)); \ -} 
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = PCKEV_XORI128_UB(in1, in0); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } -#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) { \ - v16u8 tmp_m; \ - \ - tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ - tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ - ST_UB(tmp_m, (pdst)); \ -} +#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } -#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \ - pdst, stride) { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - \ - PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ - PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ -} -#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ +#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ + stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ + PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + } +#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ diff --git a/vpx_dsp/prob.c b/vpx_dsp/prob.c index 639d24dd2f0158c1f119df9c2f50cf4f7b3bf5fc..819e95062e9fea6d4791d283ce3138fd9235ddcb 100644 --- a/vpx_dsp/prob.c +++ b/vpx_dsp/prob.c @@ -11,22 +11,16 @@ #include "./prob.h" const uint8_t vpx_norm[256] = { - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; static unsigned int tree_merge_probs_impl(unsigned int i, @@ -35,13 +29,13 @@ static unsigned int tree_merge_probs_impl(unsigned int i, const unsigned int *counts, vpx_prob *probs) { 
const int l = tree[i]; - const unsigned int left_count = (l <= 0) - ? counts[-l] - : tree_merge_probs_impl(l, tree, pre_probs, counts, probs); + const unsigned int left_count = + (l <= 0) ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, probs); const int r = tree[i + 1]; - const unsigned int right_count = (r <= 0) - ? counts[-r] - : tree_merge_probs_impl(r, tree, pre_probs, counts, probs); + const unsigned int right_count = + (r <= 0) ? counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, probs); const unsigned int ct[2] = { left_count, right_count }; probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct); return left_count + right_count; diff --git a/vpx_dsp/prob.h b/vpx_dsp/prob.h index c3cb103ffb5e8ca77f234eddbe01f3d304631176..148116ed08994e5c0db84cd47f7a8acc51ea0077 100644 --- a/vpx_dsp/prob.h +++ b/vpx_dsp/prob.h @@ -24,11 +24,11 @@ typedef uint8_t vpx_prob; #define MAX_PROB 255 -#define vpx_prob_half ((vpx_prob) 128) +#define vpx_prob_half ((vpx_prob)128) typedef int8_t vpx_tree_index; -#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2) +#define TREE_SIZE(leaf_count) (2 * (leaf_count)-2) #define vpx_complement(x) (255 - x) @@ -60,8 +60,7 @@ static INLINE vpx_prob weighted_prob(int prob1, int prob2, int factor) { return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); } -static INLINE vpx_prob merge_probs(vpx_prob pre_prob, - const unsigned int ct[2], +static INLINE vpx_prob merge_probs(vpx_prob pre_prob, const unsigned int ct[2], unsigned int count_sat, unsigned int max_update_factor) { const vpx_prob prob = get_binary_prob(ct[0], ct[1]); @@ -72,7 +71,7 @@ static INLINE vpx_prob merge_probs(vpx_prob pre_prob, // MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = { - 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, + 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 }; @@ -93,7 +92,6 @@ static INLINE vpx_prob mode_mv_merge_probs(vpx_prob pre_prob, void vpx_tree_merge_probs(const vpx_tree_index *tree, const vpx_prob *pre_probs, const unsigned int *counts, vpx_prob *probs); - DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]); #ifdef __cplusplus diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c index 5bf78627114ba36e0c2884d418fd5fb693157cc3..47afd4388abfba8d05e9295c1ee6ffd9d6219688 100644 --- a/vpx_dsp/psnr.c +++ b/vpx_dsp/psnr.c @@ -14,7 +14,6 @@ #include "vpx_dsp/psnr.h" #include "vpx_scale/yv12config.h" - double vpx_sse_to_psnr(double samples, double peak, double sse) { if (sse > 0.0) { const double psnr = 10.0 * log10(samples * peak * peak / sse); @@ -27,9 +26,9 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) { /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() * and highbd_8_variance(). It should not. 
*/ -static void encoder_variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, unsigned int *sse, int *sum) { +static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, unsigned int *sse, + int *sum) { int i, j; *sum = 0; @@ -48,10 +47,9 @@ static void encoder_variance(const uint8_t *a, int a_stride, } #if CONFIG_VP9_HIGHBITDEPTH -static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint64_t *sse, - int64_t *sum) { +static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, uint64_t *sse, int64_t *sum) { int i, j; uint16_t *a = CONVERT_TO_SHORTPTR(a8); @@ -70,22 +68,20 @@ static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, } } -static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, - unsigned int *sse, int *sum) { +static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, unsigned int *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, - &sse_long, &sum_long); + encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, + &sum_long); *sse = (unsigned int)sse_long; *sum = (int)sum_long; } #endif // CONFIG_VP9_HIGHBITDEPTH -static int64_t get_sse(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int width, int height) { +static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { const int dw = width % 16; const int dh = height % 16; int64_t total_sse = 0; @@ -94,15 +90,15 @@ static int64_t get_sse(const uint8_t *a, int a_stride, int x, y; if (dw > 0) { - encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, - dw, height, &sse, &sum); + encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, + height, &sse, &sum); total_sse += sse; } if (dh > 0) { encoder_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); + &b[(height - dh) * b_stride], b_stride, width - dw, dh, + &sse, &sum); total_sse += sse; } @@ -126,9 +122,8 @@ static int64_t get_sse(const uint8_t *a, int a_stride, #if CONFIG_VP9_HIGHBITDEPTH static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int width, int height, - unsigned int input_shift) { + const uint8_t *b8, int b_stride, int width, + int height, unsigned int input_shift) { const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); int64_t total_sse = 0; @@ -145,9 +140,8 @@ static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, return total_sse; } -static int64_t highbd_get_sse(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int width, int height) { +static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { int64_t total_sse = 0; int x, y; const int dw = width % 16; @@ -155,15 +149,14 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, unsigned int sse = 0; int sum = 0; if (dw > 0) { - encoder_highbd_8_variance(&a[width - dw], a_stride, - &b[width - dw], b_stride, - dw, height, &sse, &sum); + encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], + b_stride, 
dw, height, &sse, &sum); total_sse += sse; } if (dh > 0) { encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); + &b[(height - dh) * b_stride], b_stride, + width - dw, dh, &sse, &sum); total_sse += sse; } for (y = 0; y < height / 16; ++y) { @@ -182,38 +175,35 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, } #endif // CONFIG_VP9_HIGHBITDEPTH - int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b) { + const YV12_BUFFER_CONFIG *b) { assert(a->y_crop_width == b->y_crop_width); assert(a->y_crop_height == b->y_crop_height); return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, - a->y_crop_width, a->y_crop_height); + a->y_crop_width, a->y_crop_height); } #if CONFIG_VP9_HIGHBITDEPTH int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b) { + const YV12_BUFFER_CONFIG *b) { assert(a->y_crop_width == b->y_crop_width); assert(a->y_crop_height == b->y_crop_height); assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, - a->y_crop_width, a->y_crop_height); + a->y_crop_width, a->y_crop_height); } #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, - PSNR_STATS *psnr, uint32_t bit_depth, - uint32_t in_bit_depth) { - const int widths[3] = - { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; - const int heights[3] = - { a->y_crop_height, a->uv_crop_height, a->uv_crop_height }; + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + uint32_t bit_depth, uint32_t in_bit_depth) { + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; @@ -231,17 +221,14 @@ void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, uint64_t sse; if (a->flags & YV12_FLAG_HIGHBITDEPTH) { if (input_shift) { - sse = highbd_get_sse_shift(a_planes[i], a_strides[i], - b_planes[i], b_strides[i], w, h, - input_shift); + sse = highbd_get_sse_shift(a_planes[i], a_strides[i], b_planes[i], + b_strides[i], w, h, input_shift); } else { - sse = highbd_get_sse(a_planes[i], a_strides[i], - b_planes[i], b_strides[i], w, h); + sse = highbd_get_sse(a_planes[i], a_strides[i], b_planes[i], + b_strides[i], w, h); } } else { - sse = get_sse(a_planes[i], a_strides[i], - b_planes[i], b_strides[i], - w, h); + sse = get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); } psnr->sse[1 + i] = sse; psnr->samples[1 + i] = samples; @@ -253,8 +240,8 @@ void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, psnr->sse[0] = total_sse; psnr->samples[0] = total_samples; - psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak, - (double)total_sse); + psnr->psnr[0] = + vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse); } #endif // !CONFIG_VP9_HIGHBITDEPTH @@ -262,10 +249,9 @@ void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr) { static const double peak = 255.0; - const int widths[3] = { - a->y_crop_width, a->uv_crop_width, 
a->uv_crop_width }; - const int heights[3] = { - a->y_crop_height, a->uv_crop_height, a->uv_crop_height }; + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; @@ -278,9 +264,8 @@ void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, const int w = widths[i]; const int h = heights[i]; const uint32_t samples = w * h; - const uint64_t sse = get_sse(a_planes[i], a_strides[i], - b_planes[i], b_strides[i], - w, h); + const uint64_t sse = + get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); psnr->sse[1 + i] = sse; psnr->samples[1 + i] = samples; psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse); @@ -291,6 +276,6 @@ void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, psnr->sse[0] = total_sse; psnr->samples[0] = total_samples; - psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak, - (double)total_sse); + psnr->psnr[0] = + vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse); } diff --git a/vpx_dsp/psnr.h b/vpx_dsp/psnr.h index e25b4504af84445dd3d2fb55ca35041c71f0deeb..f321131d0b9f740b9d1b85396c5828ff57d8252b 100644 --- a/vpx_dsp/psnr.h +++ b/vpx_dsp/psnr.h @@ -11,7 +11,6 @@ #ifndef VPX_DSP_PSNR_H_ #define VPX_DSP_PSNR_H_ - #include "vpx_scale/yv12config.h" #define MAX_PSNR 100.0 @@ -37,25 +36,20 @@ typedef struct { * \param[in] sse Sum of squared errors */ double vpx_sse_to_psnr(double samples, double peak, double sse); -int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b); +int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); #if CONFIG_VP9_HIGHBITDEPTH int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, - PSNR_STATS *psnr, - unsigned int bit_depth, - unsigned int in_bit_depth); + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + unsigned int bit_depth, unsigned int in_bit_depth); #endif -void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, - PSNR_STATS *psnr); +void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr); double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *phvs_y, double *phvs_u, - double *phvs_v, uint32_t bd, uint32_t in_bd); + const YV12_BUFFER_CONFIG *dest, double *phvs_y, + double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd); #ifdef __cplusplus } // extern "C" diff --git a/vpx_dsp/psnrhvs.c b/vpx_dsp/psnrhvs.c index 3708cc3c859e379e2fe97cfc54e648334510a6d7..b3910152c472e5264c711c51e981384f2fef94f6 100644 --- a/vpx_dsp/psnrhvs.c +++ b/vpx_dsp/psnrhvs.c @@ -22,28 +22,28 @@ #include "vpx_dsp/psnr.h" #if !defined(M_PI) -# define M_PI (3.141592653589793238462643) +#define M_PI (3.141592653589793238462643) #endif #include <string.h> static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) { int i, j; - (void) xstride; + (void)xstride; vpx_fdct8x8(x, y, ystride); for (i = 0; i < 8; i++) - for (j = 0; j< 8; j++) - *(y + ystride*i + j) = (*(y + ystride*i + j) + 4) >> 3; + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 
4) >> 3; } #if CONFIG_VP9_HIGHBITDEPTH static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, - int xstride) { + int xstride) { int i, j; - (void) xstride; + (void)xstride; vpx_highbd_fdct8x8(x, y, ystride); for (i = 0; i < 8; i++) - for (j = 0; j< 8; j++) - *(y + ystride*i + j) = (*(y + ystride*i + j) + 4) >> 3; + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; } #endif @@ -51,56 +51,59 @@ static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, * transparency. This is not the JPEG based matrix from the paper, this one gives a slightly higher MOS agreement.*/ static const double csf_y[8][8] = { - {1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, - 0.678296995242, 0.466224900598, 0.3265091542}, - {2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, - 0.868920337363, 0.61280991668, 0.436405793551}, - {2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, - 0.670882927016, 0.501731932449, 0.372504254596}, - {1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, - 0.605636379554, 0.48309405692, 0.380429446972, 0.295774038565}, - {1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, - 0.448996256676, 0.352889268808, 0.283006984131, 0.226951348204}, - {0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, - 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321}, - {0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, - 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001}, - {0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, - 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276}}; + { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, + 0.678296995242, 0.466224900598, 0.3265091542 }, + { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, + 0.868920337363, 0.61280991668, 0.436405793551 }, + { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, + 0.670882927016, 0.501731932449, 0.372504254596 }, + { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554, + 0.48309405692, 0.380429446972, 0.295774038565 }, + { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676, + 0.352889268808, 0.283006984131, 0.226951348204 }, + { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, + 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 }, + { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, + 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 }, + { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, + 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 } +}; static const double csf_cb420[8][8] = { - {1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, - 0.898018824055, 0.74725392039, 0.615105596242}, - {2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, - 1.17428548929, 0.996404342439, 0.830890433625}, - {1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, - 0.960060382087, 0.849823426169, 0.731221236837}, - {1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, - 0.801821139099, 0.751437590932, 0.685398513368, 0.608694761374}, - {1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, - 0.676555426187, 0.605503172737, 0.55002013668, 0.495804539034}, - {0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, - 0.605503172737, 
0.514674450957, 0.454353482512, 0.407050308965}, - {0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, - 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733}, - {0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, - 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237}}; + { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, + 0.898018824055, 0.74725392039, 0.615105596242 }, + { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, + 1.17428548929, 0.996404342439, 0.830890433625 }, + { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, + 0.960060382087, 0.849823426169, 0.731221236837 }, + { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099, + 0.751437590932, 0.685398513368, 0.608694761374 }, + { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187, + 0.605503172737, 0.55002013668, 0.495804539034 }, + { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, + 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 }, + { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, + 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 }, + { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, + 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 } +}; static const double csf_cr420[8][8] = { - {2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, - 0.867069376285, 0.721500455585, 0.593906509971}, - {2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, - 1.13381474809, 0.962064122248, 0.802254508198}, - {1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, - 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706}, - {1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, - 0.725539939514, 0.661776842059, 0.587716619023}, - {1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, - 0.653238524286, 0.584635025748, 0.531064164893, 0.478717061273}, - {0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, - 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543}, - {0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, - 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063}, - {0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, - 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658}}; + { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, + 0.867069376285, 0.721500455585, 0.593906509971 }, + { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, + 1.13381474809, 0.962064122248, 0.802254508198 }, + { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, + 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 }, + { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, + 0.725539939514, 0.661776842059, 0.587716619023 }, + { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286, + 0.584635025748, 0.531064164893, 0.478717061273 }, + { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, + 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 }, + { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, + 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 }, + { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, + 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 } +}; static double 
convert_score_db(double _score, double _weight, int bit_depth) { int16_t pix_max = 255; @@ -110,16 +113,14 @@ static double convert_score_db(double _score, double _weight, int bit_depth) { else if (bit_depth == 12) pix_max = 4095; - if (_weight * _score < pix_max * pix_max * 1e-10) - return MAX_PSNR; + if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR; return 10 * (log10(pix_max * pix_max) - log10(_weight * _score)); } static double calc_psnrhvs(const unsigned char *src, int _systride, - const unsigned char *dst, int _dystride, - double _par, int _w, int _h, int _step, - const double _csf[8][8], uint32_t bit_depth, - uint32_t _shift) { + const unsigned char *dst, int _dystride, double _par, + int _w, int _h, int _step, const double _csf[8][8], + uint32_t bit_depth, uint32_t _shift) { double ret; const uint8_t *_src8 = src; const uint8_t *_dst8 = dst; @@ -131,7 +132,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, int pixels; int x; int y; - (void) _par; + (void)_par; ret = pixels = 0; /*In the PSNR-HVS-M paper[1] the authors describe the construction of @@ -152,8 +153,8 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/ for (x = 0; x < 8; x++) for (y = 0; y < 8; y++) - mask[x][y] = (_csf[x][y] * 0.3885746225901003) - * (_csf[x][y] * 0.3885746225901003); + mask[x][y] = + (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003); for (y = 0; y < _h - 7; y += _step) { for (x = 0; x < _w - 7; x += _step) { int i; @@ -188,27 +189,23 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, } s_gmean /= 64.f; d_gmean /= 64.f; - for (i = 0; i < 4; i++) - s_means[i] /= 16.f; - for (i = 0; i < 4; i++) - d_means[i] /= 16.f; + for (i = 0; i < 4; i++) s_means[i] /= 16.f; + for (i = 0; i < 4; i++) d_means[i] /= 16.f; for (i = 0; i < 8; i++) { for (j = 0; j < 8; j++) { int sub = ((i & 12) >> 2) + ((j & 12) >> 1); s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean); d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean); - s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) - * (dct_s[i * 8 + j] - s_means[sub]); - d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) - * (dct_d[i * 8 + j] - d_means[sub]); + s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) * + (dct_s[i * 8 + j] - s_means[sub]); + d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) * + (dct_d[i * 8 + j] - d_means[sub]); } } s_gvar *= 1 / 63.f * 64; d_gvar *= 1 / 63.f * 64; - for (i = 0; i < 4; i++) - s_vars[i] *= 1 / 15.f * 16; - for (i = 0; i < 4; i++) - d_vars[i] *= 1 / 15.f * 16; + for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16; + for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16; if (s_gvar > 0) s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar; if (d_gvar > 0) @@ -231,8 +228,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j]; s_mask = sqrt(s_mask * s_gvar) / 32.f; d_mask = sqrt(d_mask * d_gvar) / 32.f; - if (d_mask > s_mask) - s_mask = d_mask; + if (d_mask > s_mask) s_mask = d_mask; for (i = 0; i < 8; i++) { for (j = 0; j < 8; j++) { double err; @@ -245,16 +241,15 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, } } } - if (pixels <=0) - return 0; + if (pixels <= 0) return 0; ret /= pixels; return ret; } double vpx_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dest, double *y_psnrhvs, - double *u_psnrhvs, 
double *v_psnrhvs, - uint32_t bd, uint32_t in_bd) { + double *u_psnrhvs, double *v_psnrhvs, uint32_t bd, + uint32_t in_bd) { double psnrhvs; const double par = 1.0; const int step = 7; @@ -268,17 +263,13 @@ double vpx_psnrhvs(const YV12_BUFFER_CONFIG *src, *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dest->y_buffer, dest->y_stride, par, src->y_crop_width, - src->y_crop_height, step, csf_y, bd, - bd_shift); + src->y_crop_height, step, csf_y, bd, bd_shift); *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dest->u_buffer, dest->uv_stride, par, src->uv_crop_width, - src->uv_crop_height, step, csf_cb420, bd, - bd_shift); + src->uv_crop_height, step, csf_cb420, bd, bd_shift); *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dest->v_buffer, dest->uv_stride, par, src->uv_crop_width, - src->uv_crop_height, step, csf_cr420, bd, - bd_shift); + src->uv_crop_height, step, csf_cr420, bd, bd_shift); psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs)); return convert_score_db(psnrhvs, 1.0, in_bd); } - diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 80fcd66b052b07680b88a688ba4be98c4676a0f7..3c7f9832f7adc09fe6cd2fc3d6c273cdd5c67618 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -12,8 +12,7 @@ #include "vpx_dsp/quantize.h" #include "vpx_mem/vpx_mem.h" -void vpx_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { @@ -29,20 +28,19 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, if (!skip_block) { tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; - if (tmp) - eob = 0; + if (tmp) eob = 0; } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { +void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr) { int eob = -1; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -56,8 +54,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, const int abs_qcoeff = (int)((tmp * quant) >> 16); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr; - if (abs_qcoeff) - eob = 0; + if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; } @@ -81,19 +78,16 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; - if (tmp) - eob = 0; + if (tmp) eob = 0; } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, - int skip_block, - const int16_t *round_ptr, - const int16_t quant, +void 
vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, @@ -112,24 +106,22 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int abs_qcoeff = (int)((tmp * quant) >> 15); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2; - if (abs_qcoeff) - eob = 0; + if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; } #endif void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -158,12 +150,12 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, if (abs_coeff >= zbins[rc != 0]) { int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> 16; // quantization - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + quant_shift_ptr[rc != 0]) >> + 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (tmp) - eob = i; + if (tmp) eob = i; } } } @@ -176,12 +168,11 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); @@ -214,8 +205,7 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) - eob = i; + if (abs_qcoeff) eob = i; } } } @@ -224,17 +214,15 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #endif void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const 
int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1)}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; int idx = 0; int idx_arr[1024]; @@ -267,33 +255,28 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * - quant_shift_ptr[rc != 0]) >> 15; + quant_shift_ptr[rc != 0]) >> + 15; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (tmp) - eob = idx_arr[i]; + if (tmp) eob = idx_arr[i]; } } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, - intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1)}; - const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; +void vpx_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; int idx = 0; int idx_arr[1024]; @@ -322,15 +305,14 @@ void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff - + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) - eob = idx_arr[i]; + if (abs_qcoeff) eob = idx_arr[i]; } } *eob_ptr = eob + 1; diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h index 89ec5979247b86cc9a9d577782fa039da1dda24a..e13284546333cd2b38a30bd9be0a5f2833ed3ec4 100644 --- a/vpx_dsp/quantize.h +++ b/vpx_dsp/quantize.h @@ -18,8 +18,7 @@ extern "C" { #endif -void vpx_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr); @@ -29,19 +28,17 @@ void 
vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t dequant_ptr, uint16_t *eob_ptr); #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, - int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, - int skip_block, +void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr); +void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr); + const int16_t dequant_ptr, uint16_t *eob_ptr); #endif #ifdef __cplusplus diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index e49148d32e925a1e29e60e4deee14885377a7ad7..7306e8fb0ed67b489fb7b8efc6e02f1571f7f3a5 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -17,15 +17,13 @@ #include "vpx_ports/mem.h" /* Sum the difference between every corresponding element of the buffers. */ -static INLINE unsigned int sad(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int width, int height) { +static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); a += a_stride; b += b_stride; @@ -33,40 +31,43 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, return sad; } -#define sadMxN(m, n) \ -unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - return sad(src, src_stride, ref, ref_stride, m, n); \ -} \ -unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - uint8_t comp_pred[m * n]; \ - vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return sad(src, src_stride, comp_pred, m, m, n); \ -} +#define sadMxN(m, n) \ + unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint8_t comp_pred[m * n]; \ + vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ + return sad(src, src_stride, comp_pred, m, m, n); \ + } // depending on call sites, pass **ref_array to avoid & in subsequent call and // de-dup with 4D below. 
-#define sadMxNxK(m, n, k) \ -void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref_array, int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) \ - sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ -} +#define sadMxNxK(m, n, k) \ + void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref_array, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ + } // This appears to be equivalent to the above when k == 4 and refs is const -#define sadMxNx4D(m, n) \ -void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) \ - sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ -} +#define sadMxNx4D(m, n) \ + void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ + } +/* clang-format off */ #if CONFIG_VP10 && CONFIG_EXT_PARTITION // 128x128 sadMxN(128, 128) @@ -150,18 +151,18 @@ sadMxN(4, 4) sadMxNxK(4, 4, 3) sadMxNxK(4, 4, 8) sadMxNx4D(4, 4) +/* clang-format on */ #if CONFIG_VP9_HIGHBITDEPTH -static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int width, int height) { + static INLINE + unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { int y, x; unsigned int sad = 0; const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); a += a_stride; b += b_stride; @@ -176,8 +177,7 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, unsigned int sad = 0; const uint16_t *a = CONVERT_TO_SHORTPTR(a8); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); a += a_stride; b += b_stride; @@ -185,43 +185,43 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, return sad; } -#define highbd_sadMxN(m, n) \ -unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ -} \ -unsigned int vpx_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - const uint8_t *second_pred) { \ - uint16_t comp_pred[m * n]; \ - vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ -} +#define highbd_sadMxN(m, n) \ + unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int vpx_highbd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint16_t comp_pred[m * n]; \ + vpx_highbd_comp_avg_pred_c(comp_pred, 
second_pred, m, n, ref, ref_stride); \ + return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ + } -#define highbd_sadMxNxK(m, n, k) \ -void vpx_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref_array, int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, &ref_array[i], \ - ref_stride); \ - } \ -} +#define highbd_sadMxNxK(m, n, k) \ + void vpx_highbd_sad##m##x##n##x##k##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref_array, \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ + &ref_array[i], ref_stride); \ + } \ + } -#define highbd_sadMxNx4D(m, n) \ -void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, ref_array[i], \ - ref_stride); \ - } \ -} +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } +/* clang-format off */ #if CONFIG_VP10 && CONFIG_EXT_PARTITION // 128x128 highbd_sadMxN(128, 128) @@ -305,20 +305,19 @@ highbd_sadMxN(4, 4) highbd_sadMxNxK(4, 4, 3) highbd_sadMxNxK(4, 4, 8) highbd_sadMxNx4D(4, 4) - +/* clang-format on */ #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP10 && CONFIG_EXT_INTER -static INLINE unsigned int masked_sad(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, - int width, int height) { + static INLINE + unsigned int masked_sad(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const uint8_t *m, int m_stride, + int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += m[x] * abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]); a += a_stride; b += b_stride; @@ -329,13 +328,15 @@ static INLINE unsigned int masked_sad(const uint8_t *a, int a_stride, return sad; } -#define MASKSADMxN(m, n) \ -unsigned int vpx_masked_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *msk, int msk_stride) { \ - return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, n); \ -} +#define MASKSADMxN(m, n) \ + unsigned int vpx_masked_sad##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, \ + n); \ + } +/* clang-format off */ #if CONFIG_EXT_PARTITION MASKSADMxN(128, 128) MASKSADMxN(128, 64) @@ -354,20 +355,21 @@ MASKSADMxN(8, 8) MASKSADMxN(8, 4) MASKSADMxN(4, 8) MASKSADMxN(4, 4) +/* clang-format on */ #if CONFIG_VP9_HIGHBITDEPTH -static INLINE unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - int width, int height) { + static INLINE + unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int width, + int height) { int 
y, x; unsigned int sad = 0; const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += m[x] * abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]); a += a_stride; b += b_stride; @@ -378,16 +380,13 @@ static INLINE unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride, return sad; } -#define HIGHBD_MASKSADMXN(m, n) \ -unsigned int vpx_highbd_masked_sad##m##x##n##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - const uint8_t *msk, \ - int msk_stride) { \ - return highbd_masked_sad(src, src_stride, ref, ref_stride, \ - msk, msk_stride, m, n); \ -} +#define HIGHBD_MASKSADMXN(m, n) \ + unsigned int vpx_highbd_masked_sad##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad(src, src_stride, ref, ref_stride, msk, \ + msk_stride, m, n); \ + } #if CONFIG_EXT_PARTITION HIGHBD_MASKSADMXN(128, 128) @@ -415,8 +414,7 @@ HIGHBD_MASKSADMXN(4, 4) // wsrc: target weighted prediction (has been *4096 to keep precision) // mask: 2d weights (scaled by 4096) static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, - const int32_t *wsrc, - const int32_t *mask, + const int32_t *wsrc, const int32_t *mask, int width, int height) { int y, x; unsigned int sad = 0; @@ -433,13 +431,14 @@ static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, return sad; } -#define OBMCSADMxN(m, n) \ -unsigned int vpx_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ - const int32_t *wsrc, \ - const int32_t *mask) { \ - return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ -} +#define OBMCSADMxN(m, n) \ + unsigned int vpx_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *mask) { \ + return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } +/* clang-format off */ #if CONFIG_EXT_PARTITION OBMCSADMxN(128, 128) OBMCSADMxN(128, 64) @@ -458,12 +457,13 @@ OBMCSADMxN(8, 8) OBMCSADMxN(8, 4) OBMCSADMxN(4, 8) OBMCSADMxN(4, 4) +/* clang-format on */ #if CONFIG_VP9_HIGHBITDEPTH -static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - int width, int height) { + static INLINE + unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { int y, x; unsigned int sad = 0; const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); @@ -480,14 +480,14 @@ static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, return sad; } -#define HIGHBD_OBMCSADMXN(m, n) \ -unsigned int vpx_highbd_obmc_sad##m##x##n##_c(const uint8_t *ref, \ - int ref_stride, \ - const int32_t *wsrc, \ - const int32_t *mask) { \ - return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ -} +#define HIGHBD_OBMCSADMXN(m, n) \ + unsigned int vpx_highbd_obmc_sad##m##x##n##_c( \ + const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } +/* clang-format off */ #if CONFIG_EXT_PARTITION HIGHBD_OBMCSADMXN(128, 128) HIGHBD_OBMCSADMXN(128, 64) @@ -506,5 +506,6 @@ HIGHBD_OBMCSADMXN(8, 8) HIGHBD_OBMCSADMXN(8, 4) HIGHBD_OBMCSADMXN(4, 8) HIGHBD_OBMCSADMXN(4, 4) +/* clang-format on */ #endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP10 && CONFIG_OBMC diff --git 
a/vpx_dsp/ssim.c b/vpx_dsp/ssim.c index 632e272dc947e6da885db3c2cf365cbe926b3943..7a29bd29f9f1c1509956baaa031e7d3775e22659 100644 --- a/vpx_dsp/ssim.c +++ b/vpx_dsp/ssim.c @@ -15,8 +15,8 @@ #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" -void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, - int rp, uint32_t *sum_s, uint32_t *sum_r, +void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr) { int i, j; @@ -31,9 +31,8 @@ void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, } } void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, - uint32_t *sum_sq_s, uint32_t *sum_sq_r, - uint32_t *sum_sxr) { + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { int i, j; for (i = 0; i < 8; i++, s += sp, r += rp) { for (j = 0; j < 8; j++) { @@ -47,9 +46,8 @@ void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, - const uint16_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, +void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr) { int i, j; @@ -65,16 +63,15 @@ void vpx_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, } #endif // CONFIG_VP9_HIGHBITDEPTH -static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 -static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 -static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 -static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 -static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 -static double similarity(uint32_t sum_s, uint32_t sum_r, - uint32_t sum_sq_s, uint32_t sum_sq_r, - uint32_t sum_sxr, int count, +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count, uint32_t bd) { int64_t ssim_n, ssim_d; int64_t c1, c2; @@ -93,12 +90,12 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, assert(0); } - ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr - - (int64_t) 2 * sum_s * sum_r + c2); + ssim_n = (2 * sum_s * sum_r + c1) * + ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + - (int64_t)count * sum_sq_r - (int64_t) sum_r * sum_r + c2); + (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); return ssim_n * 1.0 / ssim_d; } @@ -116,12 +113,8 @@ static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; vpx_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); - return similarity(sum_s >> shift, - sum_r >> shift, - sum_sq_s >> (2 * shift), - sum_sq_r >> (2 * shift), - sum_sxr >> (2 * shift), - 64, bd); + return similarity(sum_s >> shift, sum_r >> 
shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -161,8 +154,8 @@ static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { for (j = 0; j <= width - 8; j += 4) { double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, - CONVERT_TO_SHORTPTR(img2 + j), stride_img2, - bd, shift); + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); ssim_total += v; samples++; } @@ -173,22 +166,18 @@ static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, #endif // CONFIG_VP9_HIGHBITDEPTH double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *weight) { + const YV12_BUFFER_CONFIG *dest, double *weight) { double a, b, c; double ssimv; - a = vpx_ssim2(source->y_buffer, dest->y_buffer, - source->y_stride, dest->y_stride, - source->y_crop_width, source->y_crop_height); + a = vpx_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, source->y_crop_height); - b = vpx_ssim2(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, - source->uv_crop_width, source->uv_crop_height); + b = vpx_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, source->uv_crop_height); - c = vpx_ssim2(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, - source->uv_crop_width, source->uv_crop_height); + c = vpx_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, source->uv_crop_height); ssimv = a * .8 + .1 * (b + c); @@ -232,13 +221,13 @@ static double ssimv_similarity(const Ssimv *sv, int64_t n) { const int64_t c2 = (cc2 * n * n) >> 12; const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / - (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); + (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); // Since these variables are unsigned sums, convert to double so // math is done in double arithmetic. - const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) - / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + n * sv->sum_sq_r - - sv->sum_r * sv->sum_r + c2); + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); return l * v; } @@ -267,24 +256,21 @@ static double ssimv_similarity2(const Ssimv *sv, int64_t n) { // Since these variables are unsigned, sums convert to double so // math is done in double arithmetic. 
- const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) - / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + - n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); return l * v; } static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, int img2_pitch, Ssimv *sv) { - vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, - &sv->sum_s, &sv->sum_r, &sv->sum_sq_s, &sv->sum_sq_r, - &sv->sum_sxr); + vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, + &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); } -double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, - uint8_t *img2, int img2_pitch, - int width, int height, - Ssimv *sv2, Metrics *m, - int do_inconsistency) { +double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency) { double dssim_total = 0; double ssim_total = 0; double ssim2_total = 0; @@ -295,10 +281,10 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, double old_ssim_total = 0; vpx_clear_system_state(); // We can sample points as frequently as we like start with 1 per 4x4. - for (i = 0; i < height; i += 4, - img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { for (j = 0; j < width; j += 4, ++c) { - Ssimv sv = {0}; + Ssimv sv = { 0 }; double ssim; double ssim2; double dssim; @@ -384,27 +370,29 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, // This measures how much consistent variance is in two consecutive // source frames. 1.0 means they have exactly the same variance. - const double variance_term = (2.0 * var_old * var_new + c1) / + const double variance_term = + (2.0 * var_old * var_new + c1) / (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); // This measures how consistent the local mean are between two // consecutive frames. 1.0 means they have exactly the same mean. - const double mean_term = (2.0 * mean_old * mean_new + c2) / + const double mean_term = + (2.0 * mean_old * mean_new + c2) / (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); // This measures how consistent the ssims of two // consecutive frames is. 1.0 means they are exactly the same. - double ssim_term = pow((2.0 * ssim_old * ssim_new + c3) / - (ssim_old * ssim_old + ssim_new * ssim_new + c3), - 5); + double ssim_term = + pow((2.0 * ssim_old * ssim_new + c3) / + (ssim_old * ssim_old + ssim_new * ssim_new + c3), + 5); double this_inconsistency; // Floating point math sometimes makes this > 1 by a tiny bit. // We want the metric to scale between 0 and 1.0 so we can convert // it to an snr scaled value. - if (ssim_term > 1) - ssim_term = 1; + if (ssim_term > 1) ssim_term = 1; // This converts the consistency metric to an inconsistency metric // ( so we can scale it like psnr to something like sum square error. 
@@ -432,8 +420,7 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, ssim2_total *= norm; m->ssim2 = ssim2_total; m->ssim = ssim_total; - if (old_ssim_total == 0) - inconsistency_total = 0; + if (old_ssim_total == 0) inconsistency_total = 0; m->ssimc = inconsistency_total; @@ -441,11 +428,10 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, return inconsistency_total; } - #if CONFIG_VP9_HIGHBITDEPTH double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *weight, uint32_t bd, uint32_t in_bd) { + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd) { double a, b, c; double ssimv; uint32_t shift = 0; @@ -453,20 +439,17 @@ double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, assert(bd >= in_bd); shift = bd - in_bd; - a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer, - source->y_stride, dest->y_stride, - source->y_crop_width, source->y_crop_height, - in_bd, shift); + a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, + source->y_crop_height, in_bd, shift); - b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, - source->uv_crop_width, source->uv_crop_height, - in_bd, shift); + b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); - c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, - source->uv_crop_width, source->uv_crop_height, - in_bd, shift); + c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); ssimv = a * .8 + .1 * (b + c); diff --git a/vpx_dsp/ssim.h b/vpx_dsp/ssim.h index d4d6b0d8a93d3a9f51884f3d73e5823b71e9c12e..4f2bb1d556c8cd972fb5ed9f0304fff5983c1c18 100644 --- a/vpx_dsp/ssim.h +++ b/vpx_dsp/ssim.h @@ -63,22 +63,20 @@ typedef struct { } Metrics; double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, - int img2_pitch, int width, int height, Ssimv *sv2, - Metrics *m, int do_inconsistency); + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency); double vpx_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *weight); + const YV12_BUFFER_CONFIG *dest, double *weight); double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *ssim_y, double *ssim_u, - double *ssim_v, uint32_t bd, uint32_t in_bd); + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd); #if CONFIG_VP9_HIGHBITDEPTH double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, - double *weight, + const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/subtract.c b/vpx_dsp/subtract.c index 556e0134f387bfdc49ae3895d4c253e0c774a6a8..95e7071b27e969fff125ae60b2e5b20fe944ccbe 100644 --- a/vpx_dsp/subtract.c +++ b/vpx_dsp/subtract.c @@ -16,32 +16,30 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -void vpx_subtract_block_c(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src, ptrdiff_t src_stride, - const uint8_t *pred, ptrdiff_t pred_stride) { +void vpx_subtract_block_c(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const 
uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) - diff[c] = src[c] - pred[c]; + for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c]; diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_subtract_block_c(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src8, ptrdiff_t src_stride, - const uint8_t *pred8, ptrdiff_t pred_stride, - int bd) { +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride, int bd) { int r, c; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - (void) bd; + (void)bd; for (r = 0; r < rows; r++) { for (c = 0; c < cols; c++) { @@ -50,7 +48,7 @@ void vpx_highbd_subtract_block_c(int rows, int cols, diff += diff_stride; pred += pred_stride; - src += src_stride; + src += src_stride; } } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/sum_squares.c b/vpx_dsp/sum_squares.c index c72461cd18aa2ed5555de7f2128ad662fb769c2b..73a90063415221830926d3dc7cc72bc927ce9755 100644 --- a/vpx_dsp/sum_squares.c +++ b/vpx_dsp/sum_squares.c @@ -20,9 +20,9 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, for (r = 0; r < size; r++) { for (c = 0; c < size; c++) { const int16_t v = src[c]; - ss += v*v; + ss += v * v; } - src += src_stride; + src += src_stride; } return ss; @@ -32,7 +32,7 @@ uint64_t vpx_sum_squares_i16_c(const int16_t *src, uint32_t n) { uint64_t ss = 0; do { const int16_t v = *src++; - ss += v*v; + ss += v * v; } while (--n); return ss; diff --git a/vpx_dsp/txfm_common.h b/vpx_dsp/txfm_common.h index 9b0e9900a88f9b062d3b9e0c3fcefe5471728ff8..3559b91e2c7dd1aef468fb1eb9b37ba1226e999e 100644 --- a/vpx_dsp/txfm_common.h +++ b/vpx_dsp/txfm_common.h @@ -15,7 +15,7 @@ // Constants and Macros used by all idct/dct functions #define DCT_CONST_BITS 14 -#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) +#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) #define UNIT_QUANT_SHIFT 2 #define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) @@ -25,15 +25,15 @@ // printf("static const int cospi_%d_64 = %.0f;\n", i, // round(16384 * cos(i*M_PI/64))); // Note: sin(k*Pi/64) = cos((32-k)*Pi/64) -static const tran_high_t cospi_1_64 = 16364; -static const tran_high_t cospi_2_64 = 16305; -static const tran_high_t cospi_3_64 = 16207; -static const tran_high_t cospi_4_64 = 16069; -static const tran_high_t cospi_5_64 = 15893; -static const tran_high_t cospi_6_64 = 15679; -static const tran_high_t cospi_7_64 = 15426; -static const tran_high_t cospi_8_64 = 15137; -static const tran_high_t cospi_9_64 = 14811; +static const tran_high_t cospi_1_64 = 16364; +static const tran_high_t cospi_2_64 = 16305; +static const tran_high_t cospi_3_64 = 16207; +static const tran_high_t cospi_4_64 = 16069; +static const tran_high_t cospi_5_64 = 15893; +static const tran_high_t cospi_6_64 = 15679; +static const tran_high_t cospi_7_64 = 15426; +static const tran_high_t cospi_8_64 = 15137; +static const tran_high_t cospi_9_64 = 14811; static const tran_high_t cospi_10_64 = 14449; static const tran_high_t cospi_11_64 = 14053; static const tran_high_t cospi_12_64 = 13623; diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index 3fd80dcf374a89e46b66b52f1a2afab92625e19b..79b6760c0f756a0d7a8de313cec3d87eb7ace69b 100644 --- 
a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -18,8 +18,8 @@ #include "vpx_dsp/variance.h" #include "vpx_dsp/vpx_filter.h" -uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride) { +uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride) { int distortion = 0; int r, c; @@ -49,28 +49,23 @@ uint32_t vpx_get_mb_ss_c(const int16_t *a) { uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, uint32_t *sse) { - return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0, - b, b_stride, sse); + return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse); } - uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, uint32_t *sse) { - return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4, - b, b_stride, sse); + return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse); } uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, uint32_t *sse) { - return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4, - b, b_stride, sse); + return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse); } -static void variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, uint32_t *sse, int *sum) { +static void variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, int *sum) { int i, j; *sum = 0; @@ -106,9 +101,8 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] + - (int)a[pixel_step] * filter[1], - FILTER_BITS); + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); ++a; } @@ -133,13 +127,12 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { - unsigned int i, j; + unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] + - (int)a[pixel_step] * filter[1], - FILTER_BITS); + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); ++a; } @@ -148,82 +141,78 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, } } -#define VAR(W, H) \ -uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } -#define SUBPIX_VAR(W, H) \ -uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ -\ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - 
return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ -} +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ + } -#define SUBPIX_AVG_VAR(W, H) \ -uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - int xoffset, int yoffset, \ - const uint8_t *b, \ - int b_stride, \ - uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ -\ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ -\ - return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ -} +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ + } /* Identical to the variance call except it takes an additional parameter, sum, * and returns that value using pass-by-reference instead of returning * sse - sum^2 / w*h */ -#define GET_VAR(W, H) \ -void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse, int *sum) { \ - variance(a, a_stride, b, b_stride, W, H, sse, sum); \ -} +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + int *sum) { \ + variance(a, a_stride, b, b_stride, W, H, sse, sum); \ + } /* Identical to the variance call except it does not calculate the * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in * variable. */ -#define MSE(W, H) \ -uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse; \ -} +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ + } /* All three forms of the variance are available in the same sizes. 
*/ #define VARIANCES(W, H) \ - VAR(W, H) \ - SUBPIX_VAR(W, H) \ - SUBPIX_AVG_VAR(W, H) + VAR(W, H) \ + SUBPIX_VAR(W, H) \ + SUBPIX_AVG_VAR(W, H) #if CONFIG_VP10 && CONFIG_EXT_PARTITION VARIANCES(128, 128) @@ -252,9 +241,8 @@ MSE(16, 8) MSE(8, 16) MSE(8, 8) -void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, - const uint8_t *ref, int ref_stride) { +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { int i, j; for (i = 0; i < height; ++i) { @@ -269,42 +257,41 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, } // Get pred block from up-sampled reference. -void vpx_upsampled_pred_c(uint8_t *comp_pred, - int width, int height, - const uint8_t *ref, int ref_stride) { - int i, j, k; - int stride = ref_stride << 3; - - for (i = 0; i < height; i++) { - for (j = 0, k = 0; j < width; j++, k += 8) { - comp_pred[j] = ref[k]; - } - comp_pred += width; - ref += stride; +void vpx_upsampled_pred_c(uint8_t *comp_pred, int width, int height, + const uint8_t *ref, int ref_stride) { + int i, j, k; + int stride = ref_stride << 3; + + for (i = 0; i < height; i++) { + for (j = 0, k = 0; j < width; j++, k += 8) { + comp_pred[j] = ref[k]; } + comp_pred += width; + ref += stride; + } } void vpx_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, - const uint8_t *ref, int ref_stride) { - int i, j; - int stride = ref_stride << 3; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - const int tmp = ref[(j << 3)] + pred[j]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); - } - comp_pred += width; - pred += width; - ref += stride; + int width, int height, const uint8_t *ref, + int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = ref[(j << 3)] + pred[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); } + comp_pred += width; + pred += width; + ref += stride; + } } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint64_t *sse, int64_t *sum) { +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint64_t *sse, int64_t *sum) { int i, j; uint16_t *a = CONVERT_TO_SHORTPTR(a8); @@ -323,9 +310,9 @@ static void highbd_variance64(const uint8_t *a8, int a_stride, } } -static void highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint32_t *sse, int *sum) { +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); @@ -333,9 +320,9 @@ static void highbd_8_variance(const uint8_t *a8, int a_stride, *sum = (int)sum_long; } -static void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint32_t *sse, int *sum) { +static void highbd_10_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); @@ -343,9 +330,9 @@ static void highbd_10_variance(const uint8_t *a8, int a_stride, *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); } -static 
void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint32_t *sse, int *sum) { +static void highbd_12_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); @@ -353,107 +340,91 @@ static void highbd_12_variance(const uint8_t *a8, int a_stride, *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); } -#define HIGHBD_VAR(W, H) \ -uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ -} \ -\ -uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ -} +#define HIGHBD_VAR(W, H) \ + uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } -#define HIGHBD_GET_VAR(S) \ -void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} \ -\ -void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} \ -\ -void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } -#define HIGHBD_MSE(W, H) \ -uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} \ -\ -uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} \ -\ -uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} +#define HIGHBD_MSE(W, H) \ + uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } void vpx_highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, const uint8_t *filter) { unsigned int i, j; uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); for (i = 0; i < output_height; 
++i) { for (j = 0; j < output_width; ++j) { - output_ptr[j] = - ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] + - (int)src_ptr[pixel_step] * filter[1], - FILTER_BITS); + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); ++src_ptr; } @@ -465,21 +436,17 @@ void vpx_highbd_var_filter_block2d_bil_first_pass( } void vpx_highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, const uint8_t *filter) { - unsigned int i, j; + unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - output_ptr[j] = - ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] + - (int)src_ptr[pixel_step] * filter[1], - FILTER_BITS); + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); ++src_ptr; } @@ -488,136 +455,118 @@ void vpx_highbd_var_filter_block2d_bil_second_pass( } } -#define HIGHBD_SUBPIX_VAR(W, H) \ -uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ - dst_stride, sse); \ -} \ -\ -uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, sse); \ -} \ -\ -uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, sse); \ -} +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + 
vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ -uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ -\ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ -} \ -\ -uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ -\ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ - W, dst, dst_stride, sse); \ -} \ -\ -uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - 
vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ -\ - return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ - W, dst, dst_stride, sse); \ -} +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } /* All three forms of the variance are available in the same sizes. 
*/ #define HIGHBD_VARIANCES(W, H) \ - HIGHBD_VAR(W, H) \ - HIGHBD_SUBPIX_VAR(W, H) \ - HIGHBD_SUBPIX_AVG_VAR(W, H) + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) #if CONFIG_VP10 && CONFIG_EXT_PARTITION HIGHBD_VARIANCES(128, 128) @@ -663,10 +612,8 @@ void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, } } -void vpx_highbd_upsampled_pred_c(uint16_t *comp_pred, - int width, int height, - const uint8_t *ref8, - int ref_stride) { +void vpx_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height, + const uint8_t *ref8, int ref_stride) { int i, j; int stride = ref_stride << 3; @@ -681,9 +628,8 @@ void vpx_highbd_upsampled_pred_c(uint16_t *comp_pred, } void vpx_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred, - const uint8_t *pred8, - int width, int height, - const uint8_t *ref8, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, int ref_stride) { int i, j; int stride = ref_stride << 3; @@ -703,10 +649,9 @@ void vpx_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred, #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP10 && CONFIG_EXT_INTER -void masked_variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, unsigned int *sse, int *sum) { +void masked_variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const uint8_t *m, int m_stride, int w, int h, + unsigned int *sse, int *sum) { int i, j; int64_t sum64 = 0; @@ -723,39 +668,36 @@ void masked_variance(const uint8_t *a, int a_stride, b += b_stride; m += m_stride; } - sum64 = (sum64 >= 0) ? sum64 : -sum64; + sum64 = (sum64 >= 0) ? sum64 : -sum64; *sum = (int)ROUND_POWER_OF_TWO(sum64, 6); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 12); } -#define MASK_VAR(W, H) \ -unsigned int vpx_masked_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - int sum; \ - masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define MASK_VAR(W, H) \ + unsigned int vpx_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } -#define MASK_SUBPIX_VAR(W, H) \ -unsigned int vpx_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ -\ - var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, \ - msk, msk_stride, sse); \ -} +#define MASK_SUBPIX_VAR(W, H) \ + unsigned int vpx_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ + 
bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return vpx_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, msk, \ + msk_stride, sse); \ + } MASK_VAR(4, 4) MASK_SUBPIX_VAR(4, 4) @@ -808,11 +750,10 @@ MASK_SUBPIX_VAR(128, 128) #endif // CONFIG_EXT_PARTITION #if CONFIG_VP9_HIGHBITDEPTH -void highbd_masked_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - uint64_t *sse, int64_t *sum) { +void highbd_masked_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, + int m_stride, int w, int h, uint64_t *sse, + int64_t *sum) { int i, j; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); @@ -831,150 +772,122 @@ void highbd_masked_variance64(const uint8_t *a8, int a_stride, b += b_stride; m += m_stride; } - *sum = (*sum >= 0) ? *sum : -*sum; + *sum = (*sum >= 0) ? *sum : -*sum; *sum = ROUND_POWER_OF_TWO(*sum, 6); *sse = ROUND_POWER_OF_TWO(*sse, 12); } -void highbd_masked_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - unsigned int *sse, int *sum) { +void highbd_masked_variance(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, const uint8_t *m, int m_stride, int w, + int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; - highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, - w, h, &sse64, &sum64); + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); *sum = (int)sum64; *sse = (unsigned int)sse64; } -void highbd_10_masked_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, +void highbd_10_masked_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; - highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, - w, h, &sse64, &sum64); + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); } -void highbd_12_masked_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, +void highbd_12_masked_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; - highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, - w, h, &sse64, &sum64); + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); } -#define HIGHBD_MASK_VAR(W, H) \ -unsigned int vpx_highbd_masked_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - const uint8_t *m, \ - int m_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, \ - W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -unsigned int vpx_highbd_10_masked_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - const uint8_t *m, \ 
- int m_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, \ - W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -unsigned int vpx_highbd_12_masked_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - const uint8_t *m, \ - int m_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, \ - W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define HIGHBD_MASK_VAR(W, H) \ + unsigned int vpx_highbd_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, \ + &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_10_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \ + sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_12_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \ + sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } -#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ -unsigned int vpx_highbd_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, \ - msk, msk_stride, sse); \ -} \ -\ -unsigned int vpx_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_10_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, \ - msk, msk_stride, sse); \ -} \ -\ -unsigned int vpx_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 
W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ -\ - return vpx_highbd_12_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, \ - msk, msk_stride, sse); \ -} +#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ + unsigned int vpx_highbd_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } \ + \ + unsigned int vpx_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_10_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } \ + \ + unsigned int vpx_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_12_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } HIGHBD_MASK_VAR(4, 4) HIGHBD_MASK_SUBPIX_VAR(4, 4) @@ -1029,7 +942,7 @@ HIGHBD_MASK_SUBPIX_VAR(128, 128) #endif // CONFIG_VP10 && CONFIG_EXT_INTER #if CONFIG_VP10 && CONFIG_OBMC -static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, +static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int i, j; @@ -1050,35 +963,29 @@ static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, } } -#define OBMC_VAR(W, H) \ -unsigned int vpx_obmc_variance##W##x##H##_c(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define OBMC_VAR(W, H) \ + unsigned int vpx_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } -#define OBMC_SUBPIX_VAR(W, H) \ -unsigned int vpx_obmc_sub_pixel_variance##W##x##H##_c(const uint8_t *pre, \ - int pre_stride, \ - int xoffset, 
\ - int yoffset, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return vpx_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ -} +#define OBMC_SUBPIX_VAR(W, H) \ + unsigned int vpx_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return vpx_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ + } OBMC_VAR(4, 4) OBMC_SUBPIX_VAR(4, 4) @@ -1133,8 +1040,7 @@ OBMC_SUBPIX_VAR(128, 128) #if CONFIG_VP9_HIGHBITDEPTH static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, uint64_t *sse, int64_t *sum) { int i, j; uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); @@ -1157,8 +1063,7 @@ static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; @@ -1167,10 +1072,9 @@ static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)sse64; } -static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; @@ -1179,10 +1083,9 @@ static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); } -static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64; uint64_t sse64; @@ -1191,94 +1094,76 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); } -#define HIGHBD_OBMC_VAR(W, H) \ -unsigned int vpx_highbd_obmc_variance##W##x##H##_c(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ - \ -unsigned int vpx_highbd_10_obmc_variance##W##x##H##_c(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ - \ -unsigned int vpx_highbd_12_obmc_variance##W##x##H##_c(const 
uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define HIGHBD_OBMC_VAR(W, H) \ + unsigned int vpx_highbd_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_10_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_12_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } -#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \ -unsigned int vpx_highbd_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, \ - int xoffset, int yoffset, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return vpx_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, wsrc, mask, sse); \ -} \ - \ -unsigned int vpx_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, \ - int xoffset, int yoffset, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return vpx_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, wsrc, mask, sse); \ -} \ - \ -unsigned int vpx_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, \ - int xoffset, int yoffset, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, \ - H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return vpx_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, wsrc, mask, sse); \ -} +#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \ + unsigned int vpx_highbd_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, 
bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + wsrc, mask, sse); \ + } \ + \ + unsigned int vpx_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } \ + \ + unsigned int vpx_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + vpx_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + vpx_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return vpx_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } HIGHBD_OBMC_VAR(4, 4) HIGHBD_OBMC_SUBPIX_VAR(4, 4) diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index 837fc3dbcbc1aa25829ae951d03e2bdfe46a1313..18b62dbdedc8f2f0b9f962b6d915e773e5b97ecf 100644 --- a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -22,15 +22,15 @@ extern "C" { #define FILTER_BITS 7 #define FILTER_WEIGHT 128 -typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride); +typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); -typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *second_pred); +typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *second_pred); -typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride, - uint8_t *b, int b_stride, int n); +typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, + int b_stride, int n); typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, @@ -38,8 +38,7 @@ typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride, typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, const uint8_t *const b_array[], - int b_stride, - unsigned int *sad_array); + int b_stride, unsigned int *sad_array); typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, @@ -50,95 +49,71 @@ typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse); -typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a, - int a_stride, - int xoffset, int yoffset, - const uint8_t *b, - int b_stride, - unsigned int *sse, - const uint8_t *second_pred); +typedef unsigned int (*vpx_subp_avg_variance_fn_t)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, unsigned int *sse, const uint8_t *second_pred); #if 
CONFIG_VP10 && CONFIG_EXT_INTER -typedef unsigned int(*vpx_masked_sad_fn_t)(const uint8_t *src, - int src_stride, - const uint8_t *ref, - int ref_stride, - const uint8_t *msk_ptr, - int msk_stride); -typedef unsigned int (*vpx_masked_variance_fn_t)(const uint8_t *src, - int src_stride, - const uint8_t *ref, - int ref_stride, - const uint8_t *msk, - int msk_stride, - unsigned int *sse); -typedef unsigned int (*vpx_masked_subpixvariance_fn_t)(const uint8_t *src, - int src_stride, - int xoffset, int yoffset, - const uint8_t *ref, - int ref_stride, - const uint8_t *msk, - int msk_stride, - unsigned int *sse); +typedef unsigned int (*vpx_masked_sad_fn_t)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *msk_ptr, + int msk_stride); +typedef unsigned int (*vpx_masked_variance_fn_t)( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *msk, int msk_stride, unsigned int *sse); +typedef unsigned int (*vpx_masked_subpixvariance_fn_t)( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *ref, int ref_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse); #endif // CONFIG_VP10 && CONFIG_EXT_INTER #if CONFIG_VP10 && CONFIG_OBMC -typedef unsigned int(*vpx_obmc_sad_fn_t)(const uint8_t *pred, - int pred_stride, - const int32_t *wsrc, - const int32_t *msk); +typedef unsigned int (*vpx_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, + const int32_t *wsrc, + const int32_t *msk); typedef unsigned int (*vpx_obmc_variance_fn_t)(const uint8_t *pred, int pred_stride, const int32_t *wsrc, const int32_t *msk, unsigned int *sse); -typedef unsigned int (*vpx_obmc_subpixvariance_fn_t)(const uint8_t *pred, - int pred_stride, - int xoffset, int yoffset, - const int32_t *wsrc, - const int32_t *msk, - unsigned int *sse); +typedef unsigned int (*vpx_obmc_subpixvariance_fn_t)( + const uint8_t *pred, int pred_stride, int xoffset, int yoffset, + const int32_t *wsrc, const int32_t *msk, unsigned int *sse); #endif // CONFIG_VP10 && CONFIG_OBMC #if CONFIG_VP10 typedef struct vpx_variance_vtable { - vpx_sad_fn_t sdf; - vpx_sad_avg_fn_t sdaf; - vpx_variance_fn_t vf; - vpx_subpixvariance_fn_t svf; - vpx_subp_avg_variance_fn_t svaf; - vpx_sad_multi_fn_t sdx3f; - vpx_sad_multi_fn_t sdx8f; - vpx_sad_multi_d_fn_t sdx4df; + vpx_sad_fn_t sdf; + vpx_sad_avg_fn_t sdaf; + vpx_variance_fn_t vf; + vpx_subpixvariance_fn_t svf; + vpx_subp_avg_variance_fn_t svaf; + vpx_sad_multi_fn_t sdx3f; + vpx_sad_multi_fn_t sdx8f; + vpx_sad_multi_d_fn_t sdx4df; #if CONFIG_EXT_INTER - vpx_masked_sad_fn_t msdf; - vpx_masked_variance_fn_t mvf; + vpx_masked_sad_fn_t msdf; + vpx_masked_variance_fn_t mvf; vpx_masked_subpixvariance_fn_t msvf; #endif // CONFIG_EXT_INTER #if CONFIG_OBMC - vpx_obmc_sad_fn_t osdf; - vpx_obmc_variance_fn_t ovf; - vpx_obmc_subpixvariance_fn_t osvf; + vpx_obmc_sad_fn_t osdf; + vpx_obmc_variance_fn_t ovf; + vpx_obmc_subpixvariance_fn_t osvf; #endif // CONFIG_OBMC } vpx_variance_fn_ptr_t; #endif // CONFIG_VP10 void vpx_highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, const uint8_t *filter); void vpx_highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, - uint16_t *output_ptr, - unsigned int 
src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, const uint8_t *filter); #ifdef __cplusplus diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c index 59d0488122fd15ac2ab6dd45b876659daace0f08..20d83640cab358ff693d54df668d5c9be1d9aa1b 100644 --- a/vpx_dsp/vpx_convolve.c +++ b/vpx_dsp/vpx_convolve.c @@ -21,8 +21,8 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { @@ -31,8 +31,7 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); x_q4 += x_step_q4; } @@ -43,8 +42,8 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { @@ -53,10 +52,9 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; - dst[x] = ROUND_POWER_OF_TWO(dst[x] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); x_q4 += x_step_q4; } src += src_stride; @@ -66,8 +64,8 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int x, y; src -= src_stride * (SUBPEL_TAPS / 2 - 1); @@ -89,8 +87,8 @@ static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int x, y; src -= src_stride * (SUBPEL_TAPS / 2 - 1); @@ -102,8 +100,10 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), + 
1); y_q4 += y_step_q4; } ++src; @@ -111,13 +111,11 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, +static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, - int w, int h) { + const InterpKernel *const y_filters, int y0_q4, + int y_step_q4, int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -132,7 +130,7 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); @@ -140,12 +138,11 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, assert(y_step_q4 <= 32); assert(x_step_q4 <= 32); - convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - temp, MAX_SB_SIZE, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, - dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, + MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, + intermediate_height); + convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, + dst_stride, y_filters, y0_q4, y_step_q4, w, h); } static const InterpKernel *get_filter_base(const int16_t *filter) { @@ -161,67 +158,66 @@ static int get_filter_offset(const int16_t *f, const InterpKernel *base) { void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); (void)filter_y; (void)y_step_q4; - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h); + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + w, h); } void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); (void)filter_y; (void)y_step_q4; - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h); + convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h); } void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { const InterpKernel *const filters_y = 
get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); (void)filter_x; (void)x_step_q4; - convolve_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h); + convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, + w, h); } void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); (void)filter_x; (void)x_step_q4; - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h); + convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h); } -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); @@ -229,36 +225,35 @@ void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); - convolve(src, src_stride, dst, dst_stride, - filters_x, x0_q4, x_step_q4, + convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w, h); } -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { /* Fixed size intermediate buffer places limits on parameters. 
*/ DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); - vpx_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - vpx_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, - NULL, 0, NULL, 0, w, h); + vpx_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + vpx_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w, + h); } -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int filter_x_stride, const int16_t *filter_y, + int filter_y_stride, int w, int h) { int r; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; for (r = h; r > 0; --r) { memcpy(dst, src, w); @@ -267,47 +262,44 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, } } -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int filter_x_stride, const int16_t *filter_y, + int filter_y_stride, int w, int h) { int x, y; - (void)filter_x; (void)filter_x_stride; - (void)filter_y; (void)filter_y_stride; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) - dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); src += src_stride; dst += dst_stride; } } -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, 
y_step_q4, w, h); @@ -316,8 +308,8 @@ void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } @@ -325,17 +317,16 @@ void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { + const int16_t *filter_y, int y_step_q4, int w, + int h) { vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } @@ -343,9 +334,8 @@ void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, #if CONFIG_VP9_HIGHBITDEPTH static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, - int w, int h, int bd) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { int x, y; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -356,8 +346,7 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); x_q4 += x_step_q4; } @@ -368,9 +357,8 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, - int w, int h, int bd) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { int x, y; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -381,10 +369,10 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_x[k] * x_filter[k]; - dst[x] = ROUND_POWER_OF_TWO(dst[x] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), + 1); x_q4 += x_step_q4; } src += src_stride; @@ -394,9 +382,8 @@ 
static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h, - int bd) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { int x, y; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -409,8 +396,8 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = clip_pixel_highbd( - ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); y_q4 += y_step_q4; } ++src; @@ -420,9 +407,8 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h, - int bd) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { int x, y; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); @@ -435,8 +421,10 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1); + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), + 1); y_q4 += y_step_q4; } ++src; @@ -446,11 +434,9 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, - int w, int h, int bd) { + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h, int bd) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -465,7 +451,7 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); @@ -473,28 +459,25 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, - x_filters, x0_q4, x_step_q4, w, - intermediate_height, bd); + CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height, bd); highbd_convolve_vert( - CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, - dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h, bd); + CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); } - void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); (void)filter_y; (void)y_step_q4; - highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h, bd); + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -507,22 +490,22 @@ void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, (void)filter_y; (void)y_step_q4; - highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h, bd); + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h, bd); } void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); (void)filter_x; (void)x_step_q4; - highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h, bd); + highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -535,42 +518,39 @@ void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, (void)filter_x; (void)x_step_q4; - highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h, bd); + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, bd); } void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); const InterpKernel *const filters_y = get_filter_base(filter_y); 
const int y0_q4 = get_filter_offset(filter_y, filters_y); - highbd_convolve(src, src_stride, dst, dst_stride, - filters_x, x0_q4, x_step_q4, + highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); assert(w <= MAX_SB_SIZE); assert(h <= MAX_SB_SIZE); - vpx_highbd_convolve8_c(src, src_stride, - CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, + vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, - dst, dst_stride, - NULL, 0, NULL, 0, w, h, bd); + vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst, + dst_stride, NULL, 0, NULL, 0, w, h, bd); } void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, diff --git a/vpx_dsp/vpx_convolve.h b/vpx_dsp/vpx_convolve.h index bd8679d1095112bd63f54ebb9a395789fc618fc0..1da2c1f7cca33bc5df26ecae30dc1babc58d7818 100644 --- a/vpx_dsp/vpx_convolve.h +++ b/vpx_dsp/vpx_convolve.h @@ -30,16 +30,16 @@ extern "C" { // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. #if CONFIG_VP10 && CONFIG_EXT_PARTITION -# define MAX_EXT_SIZE 263 +#define MAX_EXT_SIZE 263 #else -# define MAX_EXT_SIZE 135 +#define MAX_EXT_SIZE 135 #endif // CONFIG_VP10 && CONFIG_EXT_PARTITION typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); + const int16_t *filter_y, int y_step_q4, int w, + int h); #if CONFIG_VP9_HIGHBITDEPTH typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h index 4648c34e57e0c674ccc6c03d166a6f04c8122a0b..5706cad542979c0bd252a97d8e35caa924d44d3c 100644 --- a/vpx_dsp/vpx_dsp_common.h +++ b/vpx_dsp/vpx_dsp_common.h @@ -20,30 +20,30 @@ extern "C" { #endif #ifndef MAX_SB_SIZE -# if CONFIG_VP10 && CONFIG_EXT_PARTITION -# define MAX_SB_SIZE 128 -# else -# define MAX_SB_SIZE 64 -# endif // CONFIG_VP10 && CONFIG_EXT_PARTITION +#if CONFIG_VP10 && CONFIG_EXT_PARTITION +#define MAX_SB_SIZE 128 +#else +#define MAX_SB_SIZE 64 +#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION #endif // ndef MAX_SB_SIZE #define VPXMIN(x, y) (((x) < (y)) ? (x) : (y)) #define VPXMAX(x, y) (((x) > (y)) ? (x) : (y)) -#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') +#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') -#define IS_POWER_OF_TWO(x) (((x) & ((x) - 1)) == 0) +#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) // These can be used to give a hint about branch outcomes. // This can have an effect, even if your target processor has a // good branch predictor, as these hints can affect basic block // ordering by the compiler. 
#ifdef __GNUC__ -# define LIKELY(v) __builtin_expect(v, 1) -# define UNLIKELY(v) __builtin_expect(v, 0) +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) #else -# define LIKELY(v) (v) -# define UNLIKELY(v) (v) +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) #endif #define VPX_SWAP(type, a, b) \ @@ -83,12 +83,9 @@ static INLINE double fclamp(double value, double low, double high) { static INLINE uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: - default: - return (uint16_t)clamp(val, 0, 255); - case 10: - return (uint16_t)clamp(val, 0, 1023); - case 12: - return (uint16_t)clamp(val, 0, 4095); + default: return (uint16_t)clamp(val, 0, 255); + case 10: return (uint16_t)clamp(val, 0, 1023); + case 12: return (uint16_t)clamp(val, 0, 4095); } } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp_rtcd.c b/vpx_dsp/vpx_dsp_rtcd.c index 5fe27b614bdcd7be4df2ea700f16190d1258ab27..030c456d391777b0ed3b365f7a13f81648a09a5d 100644 --- a/vpx_dsp/vpx_dsp_rtcd.c +++ b/vpx_dsp/vpx_dsp_rtcd.c @@ -12,6 +12,4 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/vpx_once.h" -void vpx_dsp_rtcd() { - once(setup_rtcd_internal); -} +void vpx_dsp_rtcd() { once(setup_rtcd_internal); } diff --git a/vpx_dsp/vpx_filter.h b/vpx_dsp/vpx_filter.h index cfe8161961ee718b1932a7ec8b6709a5a601985f..157daea988211aa5643100690bd71bd58526382c 100644 --- a/vpx_dsp/vpx_filter.h +++ b/vpx_dsp/vpx_filter.h @@ -13,7 +13,6 @@ #include "vpx/vpx_integer.h" - #ifdef __cplusplus extern "C" { #endif @@ -27,19 +26,13 @@ extern "C" { typedef int16_t InterpKernel[SUBPEL_TAPS]; -#define BIL_SUBPEL_BITS 3 -#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS) +#define BIL_SUBPEL_BITS 3 +#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS) // 2 tap bilinear filters static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = { - { 128, 0 }, - { 112, 16 }, - { 96, 32 }, - { 80, 48 }, - { 64, 64 }, - { 48, 80 }, - { 32, 96 }, - { 16, 112 }, + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; #ifdef __cplusplus diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index ecc215a8e79cc98291970473a645b34283900d0f..9eafc6ce9ed6c19274c3e8fbca76511c120881b5 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -18,7 +18,7 @@ void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max) { __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); // Row 0 s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); @@ -96,7 +96,7 @@ void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) { __m128i s0, s1, u0; unsigned int avg = 0; - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); s0 = _mm_adds_epu16(s0, s1); @@ -124,7 +124,7 @@ unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) { __m128i s0, s1, u0; unsigned int avg = 0; - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); s0 = _mm_unpacklo_epi8(xx_loadl_32(s), u0); s1 = _mm_unpacklo_epi8(xx_loadl_32(s + p), u0); s0 = _mm_adds_epu16(s0, s1); @@ -251,8 +251,8 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, int16_t *coeff) { int 
idx; for (idx = 0; idx < 4; ++idx) { - int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride - + (idx & 0x01) * 8; + int16_t const *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); } @@ -312,7 +312,7 @@ int vpx_satd_sse2(const int16_t *coeff, int length) { return _mm_cvtsi128_si32(accum); } -void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, +void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height) { int idx; __m128i zero = _mm_setzero_si128(); @@ -381,8 +381,7 @@ int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) { return _mm_extract_epi16(s0, 0); } -int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, - const int bwl) { +int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) { int idx; int width = 4 << bwl; int16_t mean; @@ -401,23 +400,23 @@ int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, diff = _mm_subs_epi16(v0, v1); sum = _mm_add_epi16(sum, diff); - v0 = _mm_madd_epi16(diff, diff); + v0 = _mm_madd_epi16(diff, diff); sse = _mm_add_epi32(sse, v0); ref += 8; src += 8; } - v0 = _mm_srli_si128(sum, 8); + v0 = _mm_srli_si128(sum, 8); sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi64(sum, 32); + v0 = _mm_srli_epi64(sum, 32); sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi32(sum, 16); + v0 = _mm_srli_epi32(sum, 16); sum = _mm_add_epi16(sum, v0); - v1 = _mm_srli_si128(sse, 8); + v1 = _mm_srli_si128(sse, 8); sse = _mm_add_epi32(sse, v1); - v1 = _mm_srli_epi64(sse, 32); + v1 = _mm_srli_epi64(sse, 32); sse = _mm_add_epi32(sse, v1); mean = _mm_extract_epi16(sum, 0); diff --git a/vpx_dsp/x86/blend_a64_hmask_sse4.c b/vpx_dsp/x86/blend_a64_hmask_sse4.c index a10e0771bc92acc2e48bc218ff7e73d6f44f7c57..892323463cdc49313bcad37ca3f433b867ca3742 100644 --- a/vpx_dsp/x86/blend_a64_hmask_sse4.c +++ b/vpx_dsp/x86/blend_a64_hmask_sse4.c @@ -15,27 +15,21 @@ // To start out, just dispatch to the function using the 2D mask and // pass mask stride as 0. This can be improved upon if necessary. 
-void vpx_blend_a64_hmask_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { - vpx_blend_a64_mask_sse4_1(dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, 0, h, w, 0, 0); +void vpx_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + vpx_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, 0, h, w, 0, 0); } #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_blend_a64_hmask_sse4_1( - uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, - int bd) { - vpx_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, - src0_8, src0_stride, - src1_8, src1_stride, - mask, 0, h, w, 0, 0, bd); + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + vpx_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride, + src1_8, src1_stride, mask, 0, h, w, 0, 0, + bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/blend_a64_mask_sse4.c b/vpx_dsp/x86/blend_a64_mask_sse4.c index 6aa89fa2ed1d9e5c9398b1dc5785cf4dc6abf555..a5c6de5de4b3eaf479902dfa89976d458544a60a 100644 --- a/vpx_dsp/x86/blend_a64_mask_sse4.c +++ b/vpx_dsp/x86/blend_a64_mask_sse4.c @@ -26,12 +26,11 @@ // No sub-sampling ////////////////////////////////////////////////////////////////////////////// -static void blend_a64_mask_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { +static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -54,12 +53,11 @@ static void blend_a64_mask_w4_sse4_1( } while (--h); } -static void blend_a64_mask_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { +static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -83,11 +81,9 @@ static void blend_a64_mask_w8_sse4_1( } static void blend_a64_mask_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -100,10 +96,9 @@ static void blend_a64_mask_w16n_sse4_1( const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); const __m128i v_m1h_w = 
_mm_sub_epi16(v_maxval_w, v_m0h_w); - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, - v_m0l_w, v_m1l_w); - const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, - v_m0h_w, v_m1h_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); @@ -121,13 +116,11 @@ static void blend_a64_mask_w16n_sse4_1( ////////////////////////////////////////////////////////////////////////////// static void blend_a64_mask_sx_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -153,13 +146,11 @@ static void blend_a64_mask_sx_w4_sse4_1( } static void blend_a64_mask_sx_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -185,13 +176,11 @@ static void blend_a64_mask_sx_w8_sse4_1( } static void blend_a64_mask_sx_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -207,10 +196,9 @@ static void blend_a64_mask_sx_w16n_sse4_1( const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, - v_m0l_w, v_m1l_w); - const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, - v_m0h_w, v_m1h_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); @@ -228,11 +216,9 @@ static void blend_a64_mask_sx_w16n_sse4_1( 
////////////////////////////////////////////////////////////////////////////// static void blend_a64_mask_sy_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -259,11 +245,9 @@ static void blend_a64_mask_sy_w4_sse4_1( } static void blend_a64_mask_sy_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -290,11 +274,9 @@ static void blend_a64_mask_sy_w8_sse4_1( } static void blend_a64_mask_sy_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { const __m128i v_zero = _mm_setzero_si128(); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); @@ -310,10 +292,9 @@ static void blend_a64_mask_sy_w16n_sse4_1( const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, - v_m0l_w, v_m1l_w); - const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, - v_m0h_w, v_m1h_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); @@ -331,13 +312,11 @@ static void blend_a64_mask_sy_w16n_sse4_1( ////////////////////////////////////////////////////////////////////////////// static void blend_a64_mask_sx_sy_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -347,8 +326,8 @@ static void blend_a64_mask_sx_sy_w4_sse4_1( const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), - v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), 
v_zmask_b); const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); @@ -368,13 +347,11 @@ static void blend_a64_mask_sx_sy_w4_sse4_1( } static void blend_a64_mask_sx_sy_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -384,8 +361,8 @@ static void blend_a64_mask_sx_sy_w8_sse4_1( const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), - v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); @@ -405,13 +382,11 @@ static void blend_a64_mask_sx_sy_w8_sse4_1( } static void blend_a64_mask_sx_sy_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -425,10 +400,10 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b); const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b); const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b); - const __m128i v_rvsbl_w = _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), - v_zmask_b); - const __m128i v_rvsbh_w = _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), - v_zmask_b); + const __m128i v_rvsbl_w = + _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b); + const __m128i v_rvsbh_w = + _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b); const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w); const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w); @@ -437,10 +412,9 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, - v_m0l_w, v_m1l_w); - const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, - v_m0h_w, v_m1h_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); @@ -457,30 +431,27 @@ static void 
blend_a64_mask_sx_sy_w16n_sse4_1( // Dispatch ////////////////////////////////////////////////////////////////////////////// -void vpx_blend_a64_mask_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int suby, int subx) { - typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w); +void vpx_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + int w, int suby, int subx) { + typedef void (*blend_fn)( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w); // Dimensions are: width_index X subx X suby static const blend_fn blend[3][2][2] = { - { // w % 16 == 0 - {blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1}, - {blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1} - }, { // w == 4 - {blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1}, - {blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1} - }, { // w == 8 - {blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1}, - {blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1} - } + { // w % 16 == 0 + { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 }, + { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 }, + { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } }, + { // w == 8 + { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 }, + { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } } }; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); @@ -492,17 +463,12 @@ void vpx_blend_a64_mask_sse4_1( assert(IS_POWER_OF_TWO(w)); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) - vpx_blend_a64_mask_c(dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, mask_stride, - h, w, suby, subx); + vpx_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, h, w, suby, subx); } else { - blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, mask_stride, - h, w); + blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0, + src0_stride, src1, src1_stride, + mask, mask_stride, h, w); } } @@ -512,11 +478,9 @@ void vpx_blend_a64_mask_sse4_1( ////////////////////////////////////////////////////////////////////////////// static INLINE void blend_a64_mask_bn_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, blend_unit_fn blend) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -536,35 +500,28 @@ static INLINE void blend_a64_mask_bn_w4_sse4_1( } static void blend_a64_mask_b10_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, 
uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b10); + src1_stride, mask, mask_stride, h, blend_4_b10); } static void blend_a64_mask_b12_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, - blend_4_b12); + src1_stride, mask, mask_stride, h, blend_4_b12); } static INLINE void blend_a64_mask_bn_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, blend_unit_fn blend) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -586,22 +543,18 @@ static INLINE void blend_a64_mask_bn_w8n_sse4_1( } static void blend_a64_mask_b10_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b10); } static void blend_a64_mask_b12_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b12); @@ -612,13 +565,11 @@ static void blend_a64_mask_b12_w8n_sse4_1( ////////////////////////////////////////////////////////////////////////////// static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { 
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -640,11 +591,9 @@ static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( } static void blend_a64_mask_b10_sx_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -652,25 +601,22 @@ static void blend_a64_mask_b10_sx_w4_sse4_1( } static void blend_a64_mask_b12_sx_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; - blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b12); } static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -694,22 +640,18 @@ static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( } static void blend_a64_mask_b10_sx_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b10); } static void blend_a64_mask_b12_sx_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b12); @@ -720,11 +662,9 @@ static void blend_a64_mask_b12_sx_w8n_sse4_1( 
////////////////////////////////////////////////////////////////////////////// static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, blend_unit_fn blend) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -747,11 +687,9 @@ static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( } static void blend_a64_mask_b10_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -759,11 +697,9 @@ static void blend_a64_mask_b10_sy_w4_sse4_1( } static void blend_a64_mask_b12_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -771,11 +707,10 @@ static void blend_a64_mask_b12_sy_w4_sse4_1( } static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, blend_unit_fn blend) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -800,22 +735,18 @@ static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( } static void blend_a64_mask_b10_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b10); } static void blend_a64_mask_b12_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { 
blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b12); @@ -826,13 +757,11 @@ static void blend_a64_mask_b12_sy_w8n_sse4_1( ////////////////////////////////////////////////////////////////////////////// static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -840,8 +769,8 @@ static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), - v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); @@ -859,11 +788,9 @@ static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( } static void blend_a64_mask_b10_sx_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -871,11 +798,9 @@ static void blend_a64_mask_b10_sx_sy_w4_sse4_1( } static void blend_a64_mask_b12_sx_sy_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { (void)w; blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -883,13 +808,12 @@ static void blend_a64_mask_b12_sx_sy_w4_sse4_1( } static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, - 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 
0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -899,8 +823,8 @@ static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), - v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); @@ -918,22 +842,18 @@ static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( } static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b10); } static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, w, blend_8_b12); @@ -943,38 +863,38 @@ static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( // Dispatch ////////////////////////////////////////////////////////////////////////////// -void vpx_highbd_blend_a64_mask_sse4_1( - uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w, int suby, int subx, int bd) { - typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, - int h, int w); +void vpx_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, + uint32_t src0_stride, + const uint8_t *src1_8, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int h, int w, + int suby, int subx, int bd) { + typedef void (*blend_fn)( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w); // Dimensions are: bd_index X width_index X subx X suby static const blend_fn blend[2][2][2][2] = { { // bd == 8 or 10 - { // w % 8 == 0 - {blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1}, - {blend_a64_mask_b10_sx_w8n_sse4_1, blend_a64_mask_b10_sx_sy_w8n_sse4_1} - }, { // w == 4 - {blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1}, - {blend_a64_mask_b10_sx_w4_sse4_1, blend_a64_mask_b10_sx_sy_w4_sse4_1} - } - }, + { // w % 8 == 0 + { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 }, + { blend_a64_mask_b10_sx_w8n_sse4_1, + 
blend_a64_mask_b10_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 }, + { blend_a64_mask_b10_sx_w4_sse4_1, + blend_a64_mask_b10_sx_sy_w4_sse4_1 } } }, { // bd == 12 - { // w % 8 == 0 - {blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1}, - {blend_a64_mask_b12_sx_w8n_sse4_1, blend_a64_mask_b12_sx_sy_w8n_sse4_1} - }, { // w == 4 - {blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1}, - {blend_a64_mask_b12_sx_w4_sse4_1, blend_a64_mask_b12_sx_sy_w4_sse4_1} - } - } + { // w % 8 == 0 + { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 }, + { blend_a64_mask_b12_sx_w8n_sse4_1, + blend_a64_mask_b12_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 }, + { blend_a64_mask_b12_sx_w4_sse4_1, + blend_a64_mask_b12_sx_sy_w4_sse4_1 } } } }; assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); @@ -987,21 +907,17 @@ void vpx_highbd_blend_a64_mask_sse4_1( assert(bd == 8 || bd == 10 || bd == 12); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) - vpx_highbd_blend_a64_mask_c(dst_8, dst_stride, - src0_8, src0_stride, - src1_8, src1_stride, - mask, mask_stride, - h, w, suby, subx, bd); + vpx_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, mask_stride, h, w, suby, + subx, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); - blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, mask_stride, - h, w); + blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0]( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w); } } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/blend_a64_vmask_sse4.c b/vpx_dsp/x86/blend_a64_vmask_sse4.c index 85842e111935fec99f61da70d2b11d5a730f89f0..e7fe1bba150fe5bafdc95cdc30f4f27351416667 100644 --- a/vpx_dsp/x86/blend_a64_vmask_sse4.c +++ b/vpx_dsp/x86/blend_a64_vmask_sse4.c @@ -26,11 +26,10 @@ // Implementation - No sub-sampling ////////////////////////////////////////////////////////////////////////////// -static void blend_a64_vmask_w4_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -52,11 +51,10 @@ static void blend_a64_vmask_w4_sse4_1( } while (--h); } -static void blend_a64_vmask_w8_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); (void)w; @@ -78,11 +76,12 @@ static void blend_a64_vmask_w8_sse4_1( } while (--h); } -static void blend_a64_vmask_w16n_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t 
src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, + uint32_t src0_stride, + const uint8_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); do { @@ -90,10 +89,9 @@ static void blend_a64_vmask_w16n_sse4_1( const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); for (c = 0; c < w; c += 16) { - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, - v_m0_w, v_m1_w); - const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8, - v_m0_w, v_m1_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); @@ -110,15 +108,14 @@ static void blend_a64_vmask_w16n_sse4_1( // Dispatch ////////////////////////////////////////////////////////////////////////////// -void vpx_blend_a64_vmask_sse4_1( - uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { - typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride, - const uint8_t *src0, uint32_t src0_stride, - const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w); +void vpx_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w); // Dimension: width_index static const blend_fn blend[9] = { @@ -141,10 +138,8 @@ void vpx_blend_a64_vmask_sse4_1( assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); - blend[w & 0xf](dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, h, w); + blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, + w); } #if CONFIG_VP9_HIGHBITDEPTH @@ -153,9 +148,8 @@ void vpx_blend_a64_vmask_sse4_1( ////////////////////////////////////////////////////////////////////////////// static INLINE void blend_a64_vmask_bn_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); @@ -174,32 +168,31 @@ static INLINE void blend_a64_vmask_bn_w4_sse4_1( } while (--h); } -static void blend_a64_vmask_b10_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { (void)w; blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, - blend_4_b10); + src1_stride, mask, h, blend_4_b10); } -static void blend_a64_vmask_b12_w4_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, 
uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { (void)w; blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, - blend_4_b12); + src1_stride, mask, h, blend_4_b12); } static INLINE void blend_a64_vmask_bn_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA); @@ -219,24 +212,24 @@ static INLINE void blend_a64_vmask_bn_w8n_sse4_1( } while (--h); } -static void blend_a64_vmask_b10_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, w, - blend_8_b10); + src1_stride, mask, h, w, blend_8_b10); } -static void blend_a64_vmask_b12_w8n_sse4_1( - uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { +static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, w, - blend_8_b12); + src1_stride, mask, h, w, blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// @@ -244,23 +237,25 @@ static void blend_a64_vmask_b12_w8n_sse4_1( ////////////////////////////////////////////////////////////////////////////// void vpx_highbd_blend_a64_vmask_sse4_1( - uint8_t *dst_8, uint32_t dst_stride, - const uint8_t *src0_8, uint32_t src0_stride, - const uint8_t *src1_8, uint32_t src1_stride, + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd) { - typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride, - const uint16_t *src0, uint32_t src0_stride, - const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w); + typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w); // Dimensions are: bd_index X width_index static const blend_fn blend[2][2] = { - { // bd == 8 or 10 - blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 - blend_a64_vmask_b10_w4_sse4_1, // w == 4 - }, { // bd == 12 - blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 - blend_a64_vmask_b12_w4_sse4_1, // w == 4 + { + // bd == 8 or 10 + blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b10_w4_sse4_1, // w == 4 + 
}, + { + // bd == 12 + blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b12_w4_sse4_1, // w == 4 } }; @@ -275,19 +270,15 @@ void vpx_highbd_blend_a64_vmask_sse4_1( assert(bd == 8 || bd == 10 || bd == 12); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) - vpx_highbd_blend_a64_vmask_c(dst_8, dst_stride, - src0_8, src0_stride, - src1_8, src1_stride, - mask, h, w, bd); + vpx_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, h, w, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); - blend[bd == 12][(w >> 2) & 1](dst, dst_stride, - src0, src0_stride, - src1, src1_stride, - mask, h, w); + blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, w); } } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/blend_sse4.h b/vpx_dsp/x86/blend_sse4.h index 9b74f905442d05f58447493dd39bdcb837914d44..e3b031931541f49bea2bec4336c7f5676c4172d0 100644 --- a/vpx_dsp/x86/blend_sse4.h +++ b/vpx_dsp/x86/blend_sse4.h @@ -99,8 +99,8 @@ static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); // Scale - const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, - VPX_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssum_d = + _mm_srli_epi32(v_sum_d, VPX_BLEND_A64_ROUND_BITS - 1); // Pack const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); @@ -127,10 +127,10 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); // Scale - const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, - VPX_BLEND_A64_ROUND_BITS - 1); - const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, - VPX_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssuml_d = + _mm_srli_epi32(v_suml_d, VPX_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssumh_d = + _mm_srli_epi32(v_sumh_d, VPX_BLEND_A64_ROUND_BITS - 1); // Pack const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h index ab387d664d3828fe5fcde52ecb183330959e2310..ae1089e38a605c2f64183e04bd5987b3644cba45 100644 --- a/vpx_dsp/x86/convolve.h +++ b/vpx_dsp/x86/convolve.h @@ -17,272 +17,186 @@ #include "vpx_ports/mem.h" #include "vpx_dsp/vpx_convolve.h" -typedef void filter8_1dfunction ( - const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter -); +typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, const int16_t *filter); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(filter[3] != 128); \ - assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - if (w == 8) { \ - vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } else if (w == 4) { \ - 
vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } \ - } else { \ - while (w >= 16) { \ - vpx_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - if (w == 8) { \ - vpx_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } else if (w == 4) { \ - vpx_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } \ - } \ -} +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vpx_convolve8_##name##_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + assert(filter[3] != 128); \ + assert(step_q4 == 16); \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + } \ + } else { \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + } \ + } \ + } -#define FUN_CONV_2D(avg, opt) \ -void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(filter_x[3] != 128); \ - assert(filter_y[3] != 128); \ - assert(w <= MAX_SB_SIZE); \ - assert(h <= MAX_SB_SIZE); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] || filter_x[1] || filter_x[2]|| \ - filter_y[0] || filter_y[1] || filter_y[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+7)]); \ - vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - fdata2, MAX_SB_SIZE, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 7); \ - vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ - dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+1)]); \ - vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 1); \ - vpx_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } \ -} +#define FUN_CONV_2D(avg, opt) \ + void vpx_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + assert(filter_x[3] != 128); \ + assert(filter_y[3] != 128); \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + if 
(filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] || \ + filter_y[1] || filter_y[2]) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ + vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, \ + MAX_SB_SIZE, filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h + 7); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ + dst, dst_stride, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ + vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, \ + h + 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ + } #if CONFIG_VP9_HIGHBITDEPTH -typedef void highbd_filter8_1dfunction ( - const uint16_t *src_ptr, - const ptrdiff_t src_pitch, - uint16_t *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const int16_t *filter, - int bd -); +typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, int bd); #define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ - ptrdiff_t src_stride, \ - uint8_t *dst8, \ - ptrdiff_t dst_stride, \ - const int16_t *filter_x, \ - int x_step_q4, \ - const int16_t *filter_y, \ - int y_step_q4, \ - int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ -} + void vpx_highbd_convolve8_##name##_##opt( \ + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + if (filter[0] | 
filter[1] | filter[2]) { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h, bd); \ + } \ + } -#define HIGH_FUN_CONV_2D(avg, opt) \ -void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h, int bd) { \ - assert(w <= MAX_SB_SIZE); \ - assert(h <= MAX_SB_SIZE); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ - filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+7)]); \ - vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, \ - src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), \ - MAX_SB_SIZE, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, \ - MAX_SB_SIZE, \ - dst, \ - dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE+1)]); \ - vpx_highbd_convolve8_horiz_##opt(src, \ - src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), \ - MAX_SB_SIZE, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), \ - MAX_SB_SIZE, \ - dst, \ - dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h, bd); \ - } \ -} +#define HIGH_FUN_CONV_2D(avg, opt) \ + void vpx_highbd_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, \ + fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + 
CONVERT_TO_BYTEPTR(fdata2), \ + MAX_SB_SIZE, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst, \ + dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, \ + fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ + vpx_highbd_convolve8_horiz_##opt( \ + src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ + } #endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_DSP_X86_CONVOLVE_H_ diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h index 951af3a622dbeb24ed90bdada0e22e9b259c4217..39d3a3f59c1c4c0759b39abd1b008a96a808757d 100644 --- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h +++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -13,15 +13,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" -#define pair256_set_epi16(a, b) \ +#define pair256_set_epi16(a, b) \ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) -#define pair256_set_epi32(a, b) \ - _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), \ - (int)(b), (int)(a), (int)(b), (int)(a)) +#define pair256_set_epi32(a, b) \ + _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \ + (int)(b), (int)(a)) #if FDCT32x32_HIGH_PRECISION static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) { @@ -40,8 +40,7 @@ static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) { } #endif -void FDCT32x32_2D_AVX2(const int16_t *input, - int16_t *output_org, int stride) { +void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) { // Calculate pre-multiplied strides const int str1 = stride; const int str2 = 2 * stride; @@ -53,43 +52,45 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. 
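/*
 * Illustrative sketch (editor's addition, not part of the patch): how one of
 * the paired constants built with pair256_set_epi16 is consumed.  Interleaving
 * two 16-bit rows with unpacklo/unpackhi puts an (a, b) pair in every 32-bit
 * lane, so a single _mm256_madd_epi16 against a pair256_set_epi16(c0, c1)
 * constant computes a*c0 + b*c1 per lane -- half of a butterfly rotation --
 * followed by the dct_const_round_shift pattern seen throughout this file.
 * The helper name butterfly_half_avx2 is hypothetical.
 */
static INLINE __m256i butterfly_half_avx2(__m256i row_a, __m256i row_b,
                                          __m256i k_pair) {
  const __m256i lo = _mm256_unpacklo_epi16(row_a, row_b);
  const __m256i hi = _mm256_unpackhi_epi16(row_a, row_b);
  const __m256i prod_lo = _mm256_madd_epi16(lo, k_pair);  // a*c0 + b*c1, 32-bit
  const __m256i prod_hi = _mm256_madd_epi16(hi, k_pair);
  const __m256i k_round = _mm256_set1_epi32(DCT_CONST_ROUNDING);
  // dct_const_round_shift: add the rounding bias, shift back, repack to 16 bit.
  const __m256i r_lo =
      _mm256_srai_epi32(_mm256_add_epi32(prod_lo, k_round), DCT_CONST_BITS);
  const __m256i r_hi =
      _mm256_srai_epi32(_mm256_add_epi32(prod_hi, k_round), DCT_CONST_BITS);
  return _mm256_packs_epi32(r_lo, r_hi);
}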
const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64); - const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_p16_m16 = + pair256_set_epi16(+cospi_16_64, -cospi_16_64); + const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); - const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64); - const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); + const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); + const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); - const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64); - const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64); - const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64); - const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64); - const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); - const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); - const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); - const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); - const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64); - const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64); - const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64); - const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64); - const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); - const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); - const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); - const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); - const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64); - const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64); - const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64); - const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64); - const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); - const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); - const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); - const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); + const __m256i k__cospi_m12_m20 = + pair256_set_epi16(-cospi_12_64, -cospi_20_64); + const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64); + const __m256i 
k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64); + const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64); + const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); + const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64); + const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64); + const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64); + const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64); + const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); + const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); + const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); + const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); + const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64); + const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64); + const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64); + const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64); + const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); + const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); + const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); + const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); const __m256i kZero = _mm256_set1_epi16(0); - const __m256i kOne = _mm256_set1_epi16(1); + const __m256i kOne = _mm256_set1_epi16(1); // Do the two transform/transpose passes int pass; for (pass = 0; pass < 2; ++pass) { @@ -104,125 +105,149 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // Note: even though all the loads below are aligned, using the aligned // intrinsic make the code slightly slower. if (0 == pass) { - const int16_t *in = &input[column_start]; + const int16_t *in = &input[column_start]; // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; // Note: the next four blocks could be in a loop. That would help the // instruction cache but is actually slower. 
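/*
 * Editor's sketch of the rolled-up form the note above refers to (hypothetical;
 * the shipped code keeps the four blocks unrolled because the loop measured
 * slower despite the smaller instruction-cache footprint).  Each iteration
 * mirrors row i against row 31 - i: step1[i] = (a + b) << 2 and
 * step1[31 - i] = (a - b) << 2, matching the unrolled blocks below.
 */
{
  int i, j;
  for (i = 0; i < 16; i += 4) {
    const int16_t *ina = in + i * str1;
    const int16_t *inb = in + (31 - i) * str1;
    __m256i *step1a = &step1[i];
    __m256i *step1b = &step1[31 - i];
    for (j = 0; j < 4; ++j) {
      const __m256i a = _mm256_loadu_si256((const __m256i *)(ina + j * str1));
      const __m256i b = _mm256_loadu_si256((const __m256i *)(inb - j * str1));
      step1a[j] = _mm256_slli_epi16(_mm256_add_epi16(a, b), 2);
      step1b[-j] = _mm256_slli_epi16(_mm256_sub_epi16(a, b), 2);
    }
  }
}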
{ - const int16_t *ina = in + 0 * str1; - const int16_t *inb = in + 31 * str1; - __m256i *step1a = &step1[ 0]; + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; + __m256i *step1a = &step1[0]; __m256i *step1b = &step1[31]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[ 0] = _mm256_add_epi16(ina0, inb0); - step1a[ 1] = _mm256_add_epi16(ina1, inb1); - step1a[ 2] = _mm256_add_epi16(ina2, inb2); - step1a[ 3] = _mm256_add_epi16(ina3, inb3); + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); step1b[-3] = _mm256_sub_epi16(ina3, inb3); step1b[-2] = _mm256_sub_epi16(ina2, inb2); step1b[-1] = _mm256_sub_epi16(ina1, inb1); step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 4 * str1; - const int16_t *inb = in + 27 * str1; - __m256i *step1a = &step1[ 4]; + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; + __m256i *step1a = &step1[4]; __m256i *step1b = &step1[27]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[ 0] = _mm256_add_epi16(ina0, inb0); - step1a[ 1] = _mm256_add_epi16(ina1, inb1); - step1a[ 2] = _mm256_add_epi16(ina2, inb2); - step1a[ 3] = _mm256_add_epi16(ina3, inb3); + const __m256i ina0 = 
_mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); step1b[-3] = _mm256_sub_epi16(ina3, inb3); step1b[-2] = _mm256_sub_epi16(ina2, inb2); step1b[-1] = _mm256_sub_epi16(ina1, inb1); step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 8 * str1; - const int16_t *inb = in + 23 * str1; - __m256i *step1a = &step1[ 8]; + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; + __m256i *step1a = &step1[8]; __m256i *step1b = &step1[23]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[ 0] = _mm256_add_epi16(ina0, inb0); - step1a[ 1] = _mm256_add_epi16(ina1, inb1); - step1a[ 2] = _mm256_add_epi16(ina2, inb2); - step1a[ 3] = _mm256_add_epi16(ina3, inb3); + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); step1b[-3] = _mm256_sub_epi16(ina3, inb3); step1b[-2] = _mm256_sub_epi16(ina2, inb2); step1b[-1] = _mm256_sub_epi16(ina1, inb1); step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); - step1a[ 2] = 
_mm256_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 12 * str1; - const int16_t *inb = in + 19 * str1; + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; __m256i *step1a = &step1[12]; __m256i *step1b = &step1[19]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[ 0] = _mm256_add_epi16(ina0, inb0); - step1a[ 1] = _mm256_add_epi16(ina1, inb1); - step1a[ 2] = _mm256_add_epi16(ina2, inb2); - step1a[ 3] = _mm256_add_epi16(ina3, inb3); + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); step1b[-3] = _mm256_sub_epi16(ina3, inb3); step1b[-2] = _mm256_sub_epi16(ina2, inb2); step1b[-1] = _mm256_sub_epi16(ina1, inb1); step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); @@ -237,52 +262,52 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // Note: the next four blocks could be in a loop. That would help the // instruction cache but is actually slower. 
{ - __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32)); - __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32)); - __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32)); - __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32)); - __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32)); - __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32)); - __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32)); - __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32)); - step1[ 0] = _mm256_add_epi16(in00, in31); - step1[ 1] = _mm256_add_epi16(in01, in30); - step1[ 2] = _mm256_add_epi16(in02, in29); - step1[ 3] = _mm256_add_epi16(in03, in28); + __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32)); + __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32)); + __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32)); + __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32)); + __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32)); + __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32)); + __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32)); + __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32)); + step1[0] = _mm256_add_epi16(in00, in31); + step1[1] = _mm256_add_epi16(in01, in30); + step1[2] = _mm256_add_epi16(in02, in29); + step1[3] = _mm256_add_epi16(in03, in28); step1[28] = _mm256_sub_epi16(in03, in28); step1[29] = _mm256_sub_epi16(in02, in29); step1[30] = _mm256_sub_epi16(in01, in30); step1[31] = _mm256_sub_epi16(in00, in31); } { - __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32)); - __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32)); - __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32)); - __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32)); - __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32)); - __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32)); - __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32)); - __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32)); - step1[ 4] = _mm256_add_epi16(in04, in27); - step1[ 5] = _mm256_add_epi16(in05, in26); - step1[ 6] = _mm256_add_epi16(in06, in25); - step1[ 7] = _mm256_add_epi16(in07, in24); + __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32)); + __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32)); + __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32)); + __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32)); + __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32)); + __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32)); + __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32)); + __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32)); + step1[4] = _mm256_add_epi16(in04, in27); + step1[5] = _mm256_add_epi16(in05, in26); + step1[6] = _mm256_add_epi16(in06, in25); + step1[7] = _mm256_add_epi16(in07, in24); step1[24] = _mm256_sub_epi16(in07, in24); step1[25] = _mm256_sub_epi16(in06, in25); step1[26] = _mm256_sub_epi16(in05, in26); step1[27] = _mm256_sub_epi16(in04, in27); } { - __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32)); - __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32)); - __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32)); - __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32)); - __m256i 
in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32)); - __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32)); - __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32)); - __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32)); - step1[ 8] = _mm256_add_epi16(in08, in23); - step1[ 9] = _mm256_add_epi16(in09, in22); + __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32)); + __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32)); + __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32)); + __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32)); + __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32)); + __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32)); + __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32)); + __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32)); + step1[8] = _mm256_add_epi16(in08, in23); + step1[9] = _mm256_add_epi16(in09, in22); step1[10] = _mm256_add_epi16(in10, in21); step1[11] = _mm256_add_epi16(in11, in20); step1[20] = _mm256_sub_epi16(in11, in20); @@ -291,14 +316,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input, step1[23] = _mm256_sub_epi16(in08, in23); } { - __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32)); - __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32)); - __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32)); - __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32)); - __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32)); - __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32)); - __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32)); - __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32)); + __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32)); + __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32)); + __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32)); + __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32)); + __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32)); + __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32)); + __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32)); + __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32)); step1[12] = _mm256_add_epi16(in12, in19); step1[13] = _mm256_add_epi16(in13, in18); step1[14] = _mm256_add_epi16(in14, in17); @@ -311,16 +336,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // Stage 2 { - step2[ 0] = _mm256_add_epi16(step1[0], step1[15]); - step2[ 1] = _mm256_add_epi16(step1[1], step1[14]); - step2[ 2] = _mm256_add_epi16(step1[2], step1[13]); - step2[ 3] = _mm256_add_epi16(step1[3], step1[12]); - step2[ 4] = _mm256_add_epi16(step1[4], step1[11]); - step2[ 5] = _mm256_add_epi16(step1[5], step1[10]); - step2[ 6] = _mm256_add_epi16(step1[6], step1[ 9]); - step2[ 7] = _mm256_add_epi16(step1[7], step1[ 8]); - step2[ 8] = _mm256_sub_epi16(step1[7], step1[ 8]); - step2[ 9] = _mm256_sub_epi16(step1[6], step1[ 9]); + step2[0] = _mm256_add_epi16(step1[0], step1[15]); + step2[1] = _mm256_add_epi16(step1[1], step1[14]); + step2[2] = _mm256_add_epi16(step1[2], step1[13]); + step2[3] = _mm256_add_epi16(step1[3], step1[12]); + step2[4] = _mm256_add_epi16(step1[4], step1[11]); + step2[5] = _mm256_add_epi16(step1[5], step1[10]); + step2[6] = _mm256_add_epi16(step1[6], step1[9]); + step2[7] = _mm256_add_epi16(step1[7], step1[8]); + step2[8] = _mm256_sub_epi16(step1[7], 
step1[8]); + step2[9] = _mm256_sub_epi16(step1[6], step1[9]); step2[10] = _mm256_sub_epi16(step1[5], step1[10]); step2[11] = _mm256_sub_epi16(step1[4], step1[11]); step2[12] = _mm256_sub_epi16(step1[3], step1[12]); @@ -354,22 +379,38 @@ void FDCT32x32_2D_AVX2(const int16_t *input, const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16); const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16); // dct_const_round_shift - const __m256i s2_20_4 = _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); - const __m256i s2_20_5 = _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); - const __m256i s2_21_4 = _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); - const __m256i s2_21_5 = _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); - const __m256i s2_22_4 = _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); - const __m256i s2_22_5 = _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); - const __m256i s2_23_4 = _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); - const __m256i s2_23_5 = _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); - const __m256i s2_24_4 = _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); - const __m256i s2_24_5 = _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); - const __m256i s2_25_4 = _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); - const __m256i s2_25_5 = _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); - const __m256i s2_26_4 = _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); - const __m256i s2_26_5 = _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); - const __m256i s2_27_4 = _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); - const __m256i s2_27_5 = _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m256i s2_20_4 = + _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m256i s2_20_5 = + _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m256i s2_21_4 = + _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m256i s2_21_5 = + _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m256i s2_22_4 = + _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m256i s2_22_5 = + _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m256i s2_23_4 = + _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m256i s2_23_5 = + _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m256i s2_24_4 = + _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m256i s2_24_5 = + _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m256i s2_25_4 = + _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m256i s2_25_5 = + _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m256i s2_26_4 = + _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m256i s2_26_5 = + _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m256i s2_27_4 = + _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m256i s2_27_5 = + _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS); const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS); const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS); @@ -401,49 +442,49 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // dump the magnitude by half, hence the intermediate values are within // the range of 16 bits. 
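/*
 * Editor's note (scalar sketch, hypothetical helper name): the second-pass
 * block below is the SIMD form of x = (x + (x < 0) + 1) >> 2 per 16-bit lane.
 * _mm256_cmpgt_epi16(kZero, x) is -1 for negative lanes, so subtracting it
 * adds 1 only when x < 0; kOne adds the usual rounding bias, and the
 * arithmetic shift then divides by 4, rounding to nearest with ties toward
 * zero (e.g. 6 -> 1, -6 -> -1).
 */
static INLINE int16_t half_round_shift(int16_t x) {
  const int16_t sign_bias = (int16_t)(x < 0);   // plays the role of -cmpgt(0, x)
  return (int16_t)((x + sign_bias + 1) >> 2);   // assumes arithmetic >>, as srai
}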
if (1 == pass) { - __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero,step2[ 0]); - __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero,step2[ 1]); - __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero,step2[ 2]); - __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero,step2[ 3]); - __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero,step2[ 4]); - __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero,step2[ 5]); - __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero,step2[ 6]); - __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero,step2[ 7]); - __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero,step2[ 8]); - __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero,step2[ 9]); - __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero,step2[10]); - __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero,step2[11]); - __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero,step2[12]); - __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero,step2[13]); - __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero,step2[14]); - __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero,step2[15]); - __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero,step1[16]); - __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero,step1[17]); - __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero,step1[18]); - __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero,step1[19]); - __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero,step2[20]); - __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero,step2[21]); - __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero,step2[22]); - __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero,step2[23]); - __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero,step2[24]); - __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero,step2[25]); - __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero,step2[26]); - __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero,step2[27]); - __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero,step1[28]); - __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero,step1[29]); - __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero,step1[30]); - __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero,step1[31]); - - step2[ 0] = _mm256_sub_epi16(step2[ 0], s3_00_0); - step2[ 1] = _mm256_sub_epi16(step2[ 1], s3_01_0); - step2[ 2] = _mm256_sub_epi16(step2[ 2], s3_02_0); - step2[ 3] = _mm256_sub_epi16(step2[ 3], s3_03_0); - step2[ 4] = _mm256_sub_epi16(step2[ 4], s3_04_0); - step2[ 5] = _mm256_sub_epi16(step2[ 5], s3_05_0); - step2[ 6] = _mm256_sub_epi16(step2[ 6], s3_06_0); - step2[ 7] = _mm256_sub_epi16(step2[ 7], s3_07_0); - step2[ 8] = _mm256_sub_epi16(step2[ 8], s2_08_0); - step2[ 9] = _mm256_sub_epi16(step2[ 9], s2_09_0); + __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]); + __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]); + __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]); + __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]); + __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]); + __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]); + __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]); + __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]); + __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]); + __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]); + __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]); + __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]); + __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]); + __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]); + __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]); + __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]); + __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]); + __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]); + __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]); + __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]); + __m256i s3_20_0 = 
_mm256_cmpgt_epi16(kZero, step2[20]); + __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]); + __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]); + __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]); + __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]); + __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]); + __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]); + __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]); + __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]); + __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]); + __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]); + __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]); + + step2[0] = _mm256_sub_epi16(step2[0], s3_00_0); + step2[1] = _mm256_sub_epi16(step2[1], s3_01_0); + step2[2] = _mm256_sub_epi16(step2[2], s3_02_0); + step2[3] = _mm256_sub_epi16(step2[3], s3_03_0); + step2[4] = _mm256_sub_epi16(step2[4], s3_04_0); + step2[5] = _mm256_sub_epi16(step2[5], s3_05_0); + step2[6] = _mm256_sub_epi16(step2[6], s3_06_0); + step2[7] = _mm256_sub_epi16(step2[7], s3_07_0); + step2[8] = _mm256_sub_epi16(step2[8], s2_08_0); + step2[9] = _mm256_sub_epi16(step2[9], s2_09_0); step2[10] = _mm256_sub_epi16(step2[10], s3_10_0); step2[11] = _mm256_sub_epi16(step2[11], s3_11_0); step2[12] = _mm256_sub_epi16(step2[12], s3_12_0); @@ -467,16 +508,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, step1[30] = _mm256_sub_epi16(step1[30], s3_30_0); step1[31] = _mm256_sub_epi16(step1[31], s3_31_0); - step2[ 0] = _mm256_add_epi16(step2[ 0], kOne); - step2[ 1] = _mm256_add_epi16(step2[ 1], kOne); - step2[ 2] = _mm256_add_epi16(step2[ 2], kOne); - step2[ 3] = _mm256_add_epi16(step2[ 3], kOne); - step2[ 4] = _mm256_add_epi16(step2[ 4], kOne); - step2[ 5] = _mm256_add_epi16(step2[ 5], kOne); - step2[ 6] = _mm256_add_epi16(step2[ 6], kOne); - step2[ 7] = _mm256_add_epi16(step2[ 7], kOne); - step2[ 8] = _mm256_add_epi16(step2[ 8], kOne); - step2[ 9] = _mm256_add_epi16(step2[ 9], kOne); + step2[0] = _mm256_add_epi16(step2[0], kOne); + step2[1] = _mm256_add_epi16(step2[1], kOne); + step2[2] = _mm256_add_epi16(step2[2], kOne); + step2[3] = _mm256_add_epi16(step2[3], kOne); + step2[4] = _mm256_add_epi16(step2[4], kOne); + step2[5] = _mm256_add_epi16(step2[5], kOne); + step2[6] = _mm256_add_epi16(step2[6], kOne); + step2[7] = _mm256_add_epi16(step2[7], kOne); + step2[8] = _mm256_add_epi16(step2[8], kOne); + step2[9] = _mm256_add_epi16(step2[9], kOne); step2[10] = _mm256_add_epi16(step2[10], kOne); step2[11] = _mm256_add_epi16(step2[11], kOne); step2[12] = _mm256_add_epi16(step2[12], kOne); @@ -500,16 +541,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, step1[30] = _mm256_add_epi16(step1[30], kOne); step1[31] = _mm256_add_epi16(step1[31], kOne); - step2[ 0] = _mm256_srai_epi16(step2[ 0], 2); - step2[ 1] = _mm256_srai_epi16(step2[ 1], 2); - step2[ 2] = _mm256_srai_epi16(step2[ 2], 2); - step2[ 3] = _mm256_srai_epi16(step2[ 3], 2); - step2[ 4] = _mm256_srai_epi16(step2[ 4], 2); - step2[ 5] = _mm256_srai_epi16(step2[ 5], 2); - step2[ 6] = _mm256_srai_epi16(step2[ 6], 2); - step2[ 7] = _mm256_srai_epi16(step2[ 7], 2); - step2[ 8] = _mm256_srai_epi16(step2[ 8], 2); - step2[ 9] = _mm256_srai_epi16(step2[ 9], 2); + step2[0] = _mm256_srai_epi16(step2[0], 2); + step2[1] = _mm256_srai_epi16(step2[1], 2); + step2[2] = _mm256_srai_epi16(step2[2], 2); + step2[3] = _mm256_srai_epi16(step2[3], 2); + step2[4] = _mm256_srai_epi16(step2[4], 2); + step2[5] = _mm256_srai_epi16(step2[5], 2); + step2[6] = _mm256_srai_epi16(step2[6], 2); + step2[7] = 
_mm256_srai_epi16(step2[7], 2); + step2[8] = _mm256_srai_epi16(step2[8], 2); + step2[9] = _mm256_srai_epi16(step2[9], 2); step2[10] = _mm256_srai_epi16(step2[10], 2); step2[11] = _mm256_srai_epi16(step2[11], 2); step2[12] = _mm256_srai_epi16(step2[12], 2); @@ -538,616 +579,796 @@ void FDCT32x32_2D_AVX2(const int16_t *input, #if FDCT32x32_HIGH_PRECISION if (pass == 0) { #endif - // Stage 3 - { - step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]); - step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]); - step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]); - step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]); - step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]); - step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]); - step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]); - step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]); - } - { - const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); - const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); - const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); - const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); - const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); - const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7); - } - { - step3[16] = _mm256_add_epi16(step2[23], step1[16]); - step3[17] = _mm256_add_epi16(step2[22], step1[17]); - step3[18] = _mm256_add_epi16(step2[21], step1[18]); - step3[19] = _mm256_add_epi16(step2[20], step1[19]); - step3[20] = _mm256_sub_epi16(step1[19], step2[20]); - step3[21] = _mm256_sub_epi16(step1[18], step2[21]); - step3[22] = _mm256_sub_epi16(step1[17], step2[22]); - step3[23] = 
_mm256_sub_epi16(step1[16], step2[23]); - step3[24] = _mm256_sub_epi16(step1[31], step2[24]); - step3[25] = _mm256_sub_epi16(step1[30], step2[25]); - step3[26] = _mm256_sub_epi16(step1[29], step2[26]); - step3[27] = _mm256_sub_epi16(step1[28], step2[27]); - step3[28] = _mm256_add_epi16(step2[27], step1[28]); - step3[29] = _mm256_add_epi16(step2[26], step1[29]); - step3[30] = _mm256_add_epi16(step2[25], step1[30]); - step3[31] = _mm256_add_epi16(step2[24], step1[31]); - } + // Stage 3 + { + step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); + const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); + const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); + const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); + const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s3_10_4 = + _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = + _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = + _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = + _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = + _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = + _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = + _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = + _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm256_add_epi16(step2[23], step1[16]); + step3[17] = _mm256_add_epi16(step2[22], step1[17]); + step3[18] = _mm256_add_epi16(step2[21], step1[18]); + step3[19] = _mm256_add_epi16(step2[20], step1[19]); + step3[20] = _mm256_sub_epi16(step1[19], step2[20]); + step3[21] = 
_mm256_sub_epi16(step1[18], step2[21]); + step3[22] = _mm256_sub_epi16(step1[17], step2[22]); + step3[23] = _mm256_sub_epi16(step1[16], step2[23]); + step3[24] = _mm256_sub_epi16(step1[31], step2[24]); + step3[25] = _mm256_sub_epi16(step1[30], step2[25]); + step3[26] = _mm256_sub_epi16(step1[29], step2[26]); + step3[27] = _mm256_sub_epi16(step1[28], step2[27]); + step3[28] = _mm256_add_epi16(step2[27], step1[28]); + step3[29] = _mm256_add_epi16(step2[26], step1[29]); + step3[30] = _mm256_add_epi16(step2[25], step1[30]); + step3[31] = _mm256_add_epi16(step2[24], step1[31]); + } - // Stage 4 - { - step1[ 0] = _mm256_add_epi16(step3[ 3], step3[ 0]); - step1[ 1] = _mm256_add_epi16(step3[ 2], step3[ 1]); - step1[ 2] = _mm256_sub_epi16(step3[ 1], step3[ 2]); - step1[ 3] = _mm256_sub_epi16(step3[ 0], step3[ 3]); - step1[ 8] = _mm256_add_epi16(step3[11], step2[ 8]); - step1[ 9] = _mm256_add_epi16(step3[10], step2[ 9]); - step1[10] = _mm256_sub_epi16(step2[ 9], step3[10]); - step1[11] = _mm256_sub_epi16(step2[ 8], step3[11]); - step1[12] = _mm256_sub_epi16(step2[15], step3[12]); - step1[13] = _mm256_sub_epi16(step2[14], step3[13]); - step1[14] = _mm256_add_epi16(step3[13], step2[14]); - step1[15] = _mm256_add_epi16(step3[12], step2[15]); - } - { - const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]); - const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]); - const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s1_05_4 = _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m256i s1_05_5 = _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m256i s1_06_4 = _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m256i s1_06_5 = _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7); - } - { - const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]); - const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]); - const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]); - const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]); - const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]); - const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]); - const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]); - const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]); - const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, 
k__cospi_m24_m08); - const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m256i s1_18_4 = _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m256i s1_18_5 = _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m256i s1_19_4 = _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m256i s1_19_5 = _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m256i s1_20_4 = _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m256i s1_20_5 = _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m256i s1_21_4 = _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m256i s1_21_5 = _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m256i s1_26_4 = _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m256i s1_26_5 = _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m256i s1_27_4 = _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m256i s1_27_5 = _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m256i s1_28_4 = _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m256i s1_28_5 = _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m256i s1_29_4 = _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m256i s1_29_5 = _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS); - const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7); - } - // Stage 5 - { - step2[4] = _mm256_add_epi16(step1[5], step3[4]); - step2[5] = _mm256_sub_epi16(step3[4], step1[5]); - step2[6] = _mm256_sub_epi16(step3[7], 
step1[6]); - step2[7] = _mm256_add_epi16(step1[6], step3[7]); - } - { - const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]); - const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]); - const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]); - const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]); - const __m256i out_00_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m256i out_00_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_p16); - const __m256i out_16_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m256i out_16_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m256i out_08_2 = _mm256_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m256i out_08_3 = _mm256_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m256i out_24_2 = _mm256_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m256i out_24_3 = _mm256_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m256i out_00_4 = _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m256i out_00_5 = _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m256i out_16_4 = _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m256i out_16_5 = _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m256i out_08_4 = _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m256i out_08_5 = _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m256i out_24_4 = _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m256i out_24_5 = _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[ 0] = _mm256_packs_epi32(out_00_6, out_00_7); - out[16] = _mm256_packs_epi32(out_16_6, out_16_7); - out[ 8] = _mm256_packs_epi32(out_08_6, out_08_7); - out[24] = _mm256_packs_epi32(out_24_6, out_24_7); - } - { - const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[ 9], step1[14]); - const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[ 9], step1[14]); - const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]); - const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]); - const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m256i s2_09_4 = _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m256i s2_09_5 = _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m256i s2_10_4 = _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m256i s2_10_5 = 
_mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m256i s2_13_4 = _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m256i s2_13_5 = _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m256i s2_14_4 = _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m256i s2_14_5 = _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[ 9] = _mm256_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7); - } - { - step2[16] = _mm256_add_epi16(step1[19], step3[16]); - step2[17] = _mm256_add_epi16(step1[18], step3[17]); - step2[18] = _mm256_sub_epi16(step3[17], step1[18]); - step2[19] = _mm256_sub_epi16(step3[16], step1[19]); - step2[20] = _mm256_sub_epi16(step3[23], step1[20]); - step2[21] = _mm256_sub_epi16(step3[22], step1[21]); - step2[22] = _mm256_add_epi16(step1[21], step3[22]); - step2[23] = _mm256_add_epi16(step1[20], step3[23]); - step2[24] = _mm256_add_epi16(step1[27], step3[24]); - step2[25] = _mm256_add_epi16(step1[26], step3[25]); - step2[26] = _mm256_sub_epi16(step3[25], step1[26]); - step2[27] = _mm256_sub_epi16(step3[24], step1[27]); - step2[28] = _mm256_sub_epi16(step3[31], step1[28]); - step2[29] = _mm256_sub_epi16(step3[30], step1[29]); - step2[30] = _mm256_add_epi16(step1[29], step3[30]); - step2[31] = _mm256_add_epi16(step1[28], step3[31]); - } - // Stage 6 - { - const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); - const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); - const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); - const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); - const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); - const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); - const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); - const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); - const __m256i out_04_2 = _mm256_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m256i out_04_3 = _mm256_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m256i out_20_2 = _mm256_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m256i out_20_3 = _mm256_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m256i out_12_2 = _mm256_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m256i out_12_3 = _mm256_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m256i out_28_2 = _mm256_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m256i out_28_3 = _mm256_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m256i out_04_4 = _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m256i out_04_5 = _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m256i out_20_4 = _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m256i out_20_5 = _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m256i out_12_4 = 
_mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m256i out_12_5 = _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m256i out_28_4 = _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m256i out_28_5 = _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[ 4] = _mm256_packs_epi32(out_04_6, out_04_7); - out[20] = _mm256_packs_epi32(out_20_6, out_20_7); - out[12] = _mm256_packs_epi32(out_12_6, out_12_7); - out[28] = _mm256_packs_epi32(out_28_6, out_28_7); - } - { - step3[ 8] = _mm256_add_epi16(step2[ 9], step1[ 8]); - step3[ 9] = _mm256_sub_epi16(step1[ 8], step2[ 9]); - step3[10] = _mm256_sub_epi16(step1[11], step2[10]); - step3[11] = _mm256_add_epi16(step2[10], step1[11]); - step3[12] = _mm256_add_epi16(step2[13], step1[12]); - step3[13] = _mm256_sub_epi16(step1[12], step2[13]); - step3[14] = _mm256_sub_epi16(step1[15], step2[14]); - step3[15] = _mm256_add_epi16(step2[14], step1[15]); - } - { - const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]); - const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]); - const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]); - const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]); - const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]); - const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]); - const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]); - const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]); - const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m256i s3_17_4 = _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m256i s3_17_5 = _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m256i s3_18_4 = _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m256i s3_18_5 = 
_mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m256i s3_21_4 = _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m256i s3_21_5 = _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m256i s3_22_4 = _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m256i s3_22_5 = _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS); - const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m256i s3_25_4 = _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m256i s3_25_5 = _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m256i s3_26_4 = _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m256i s3_26_5 = _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m256i s3_29_4 = _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m256i s3_29_5 = _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m256i s3_30_4 = _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m256i s3_30_5 = _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7); - } - // Stage 7 - { - const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[ 8], step3[15]); - const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[ 8], step3[15]); - const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[ 9], step3[14]); - const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[ 9], step3[14]); - const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]); - const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]); - const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]); - const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]); - const __m256i out_02_2 = _mm256_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m256i out_02_3 = _mm256_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m256i out_18_2 = _mm256_madd_epi16(out_18_0, k__cospi_p14_p18); - const __m256i out_18_3 = _mm256_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m256i out_10_2 = _mm256_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m256i out_10_3 = _mm256_madd_epi16(out_10_1, 
k__cospi_p22_p10); - const __m256i out_26_2 = _mm256_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m256i out_26_3 = _mm256_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m256i out_06_2 = _mm256_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m256i out_06_3 = _mm256_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m256i out_22_2 = _mm256_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m256i out_22_3 = _mm256_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m256i out_14_2 = _mm256_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m256i out_14_3 = _mm256_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m256i out_30_2 = _mm256_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m256i out_30_3 = _mm256_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m256i out_02_4 = _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const __m256i out_02_5 = _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m256i out_18_4 = _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m256i out_18_5 = _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m256i out_10_4 = _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m256i out_10_5 = _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m256i out_26_4 = _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m256i out_26_5 = _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m256i out_06_4 = _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m256i out_06_5 = _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m256i out_22_4 = _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m256i out_22_5 = _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m256i out_14_4 = _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m256i out_14_5 = _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m256i out_30_4 = _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m256i out_30_5 = _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[ 2] = _mm256_packs_epi32(out_02_6, out_02_7); - out[18] = _mm256_packs_epi32(out_18_6, out_18_7); - out[10] = _mm256_packs_epi32(out_10_6, out_10_7); - out[26] = _mm256_packs_epi32(out_26_6, out_26_7); - out[ 6] = _mm256_packs_epi32(out_06_6, out_06_7); - out[22] = _mm256_packs_epi32(out_22_6, out_22_7); - out[14] = _mm256_packs_epi32(out_14_6, 
out_14_7); - out[30] = _mm256_packs_epi32(out_30_6, out_30_7); - } - { - step1[16] = _mm256_add_epi16(step3[17], step2[16]); - step1[17] = _mm256_sub_epi16(step2[16], step3[17]); - step1[18] = _mm256_sub_epi16(step2[19], step3[18]); - step1[19] = _mm256_add_epi16(step3[18], step2[19]); - step1[20] = _mm256_add_epi16(step3[21], step2[20]); - step1[21] = _mm256_sub_epi16(step2[20], step3[21]); - step1[22] = _mm256_sub_epi16(step2[23], step3[22]); - step1[23] = _mm256_add_epi16(step3[22], step2[23]); - step1[24] = _mm256_add_epi16(step3[25], step2[24]); - step1[25] = _mm256_sub_epi16(step2[24], step3[25]); - step1[26] = _mm256_sub_epi16(step2[27], step3[26]); - step1[27] = _mm256_add_epi16(step3[26], step2[27]); - step1[28] = _mm256_add_epi16(step3[29], step2[28]); - step1[29] = _mm256_sub_epi16(step2[28], step3[29]); - step1[30] = _mm256_sub_epi16(step2[31], step3[30]); - step1[31] = _mm256_add_epi16(step3[30], step2[31]); - } - // Final stage --- outputs indices are bit-reversed. - { - const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]); - const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]); - const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]); - const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]); - const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]); - const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]); - const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]); - const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]); - const __m256i out_01_2 = _mm256_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m256i out_01_3 = _mm256_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m256i out_17_2 = _mm256_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m256i out_17_3 = _mm256_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m256i out_09_2 = _mm256_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m256i out_09_3 = _mm256_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m256i out_25_2 = _mm256_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m256i out_25_3 = _mm256_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m256i out_07_2 = _mm256_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m256i out_07_3 = _mm256_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m256i out_23_2 = _mm256_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m256i out_23_3 = _mm256_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m256i out_15_2 = _mm256_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m256i out_15_3 = _mm256_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m256i out_31_2 = _mm256_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m256i out_31_3 = _mm256_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m256i out_01_4 = _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m256i out_01_5 = _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m256i out_17_4 = _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m256i out_17_5 = _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m256i out_09_4 = _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m256i out_09_5 = _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m256i out_25_4 = _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m256i out_25_5 = _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m256i out_07_4 = _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m256i out_07_5 = _mm256_add_epi32(out_07_3, 
k__DCT_CONST_ROUNDING); - const __m256i out_23_4 = _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m256i out_23_5 = _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m256i out_15_4 = _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m256i out_15_5 = _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m256i out_31_4 = _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m256i out_31_5 = _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS); - const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[ 1] = _mm256_packs_epi32(out_01_6, out_01_7); - out[17] = _mm256_packs_epi32(out_17_6, out_17_7); - out[ 9] = _mm256_packs_epi32(out_09_6, out_09_7); - out[25] = _mm256_packs_epi32(out_25_6, out_25_7); - out[ 7] = _mm256_packs_epi32(out_07_6, out_07_7); - out[23] = _mm256_packs_epi32(out_23_6, out_23_7); - out[15] = _mm256_packs_epi32(out_15_6, out_15_7); - out[31] = _mm256_packs_epi32(out_31_6, out_31_7); - } - { - const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]); - const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]); - const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]); - const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]); - const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]); - const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]); - const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]); - const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]); - const __m256i out_05_2 = _mm256_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m256i out_05_3 = _mm256_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m256i out_21_2 = _mm256_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m256i out_21_3 = _mm256_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m256i out_13_2 = _mm256_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m256i out_13_3 = _mm256_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m256i out_29_2 = _mm256_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m256i out_29_3 = _mm256_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m256i out_03_2 = _mm256_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m256i out_03_3 = _mm256_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m256i out_19_2 = _mm256_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m256i out_19_3 = _mm256_madd_epi16(out_13_1, k__cospi_m13_p19); - const 
__m256i out_11_2 = _mm256_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m256i out_11_3 = _mm256_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m256i out_27_2 = _mm256_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m256i out_27_3 = _mm256_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m256i out_05_4 = _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m256i out_05_5 = _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m256i out_21_4 = _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m256i out_21_5 = _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m256i out_13_4 = _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); - const __m256i out_13_5 = _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m256i out_29_4 = _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m256i out_29_5 = _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m256i out_03_4 = _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m256i out_03_5 = _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m256i out_19_4 = _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m256i out_19_5 = _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m256i out_11_4 = _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); - const __m256i out_11_5 = _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m256i out_27_4 = _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m256i out_27_5 = _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS); - const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[ 5] = _mm256_packs_epi32(out_05_6, out_05_7); - out[21] = _mm256_packs_epi32(out_21_6, out_21_7); - out[13] = _mm256_packs_epi32(out_13_6, out_13_7); - out[29] = _mm256_packs_epi32(out_29_6, out_29_7); - out[ 3] = _mm256_packs_epi32(out_03_6, out_03_7); - out[19] = _mm256_packs_epi32(out_19_6, out_19_7); - out[11] = _mm256_packs_epi32(out_11_6, out_11_7); - out[27] = _mm256_packs_epi32(out_27_6, out_27_7); - } + // Stage 4 + { + step1[0] = _mm256_add_epi16(step3[3], step3[0]); + step1[1] = _mm256_add_epi16(step3[2], step3[1]); + step1[2] = _mm256_sub_epi16(step3[1], step3[2]); + step1[3] = _mm256_sub_epi16(step3[0], step3[3]); + step1[8] = _mm256_add_epi16(step3[11], step2[8]); + step1[9] = _mm256_add_epi16(step3[10], step2[9]); + step1[10] = _mm256_sub_epi16(step2[9], step3[10]); + step1[11] = 
_mm256_sub_epi16(step2[8], step3[11]); + step1[12] = _mm256_sub_epi16(step2[15], step3[12]); + step1[13] = _mm256_sub_epi16(step2[14], step3[13]); + step1[14] = _mm256_add_epi16(step3[13], step2[14]); + step1[15] = _mm256_add_epi16(step3[12], step2[15]); + } + { + const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]); + const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]); + const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s1_05_4 = + _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m256i s1_05_5 = + _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m256i s1_06_4 = + _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m256i s1_06_5 = + _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]); + const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]); + const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]); + const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]); + const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]); + const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]); + const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]); + const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]); + const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s1_18_4 = + _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m256i s1_18_5 = + _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m256i s1_19_4 = + _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m256i s1_19_5 = + _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m256i s1_20_4 = + _mm256_add_epi32(s1_20_2, 
k__DCT_CONST_ROUNDING); + const __m256i s1_20_5 = + _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m256i s1_21_4 = + _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m256i s1_21_5 = + _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m256i s1_26_4 = + _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m256i s1_26_5 = + _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m256i s1_27_4 = + _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m256i s1_27_5 = + _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m256i s1_28_4 = + _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m256i s1_28_5 = + _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m256i s1_29_4 = + _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m256i s1_29_5 = + _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm256_add_epi16(step1[5], step3[4]); + step2[5] = _mm256_sub_epi16(step3[4], step1[5]); + step2[6] = _mm256_sub_epi16(step3[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[6], step3[7]); + } + { + const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]); + const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]); + const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]); + const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]); + const __m256i out_00_2 = + _mm256_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m256i out_00_3 = + _mm256_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m256i out_16_2 = + _mm256_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m256i out_16_3 = + _mm256_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m256i out_08_2 = + _mm256_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m256i out_08_3 = + _mm256_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m256i out_24_2 = + _mm256_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m256i out_24_3 = + _mm256_madd_epi16(out_08_1, k__cospi_m08_p24); + 
// dct_const_round_shift + const __m256i out_00_4 = + _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m256i out_00_5 = + _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m256i out_16_4 = + _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m256i out_16_5 = + _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m256i out_08_4 = + _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m256i out_08_5 = + _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m256i out_24_4 = + _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m256i out_24_5 = + _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm256_packs_epi32(out_00_6, out_00_7); + out[16] = _mm256_packs_epi32(out_16_6, out_16_7); + out[8] = _mm256_packs_epi32(out_08_6, out_08_7); + out[24] = _mm256_packs_epi32(out_24_6, out_24_7); + } + { + const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]); + const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]); + const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]); + const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]); + const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s2_09_4 = + _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m256i s2_09_5 = + _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m256i s2_10_4 = + _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m256i s2_10_5 = + _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m256i s2_13_4 = + _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m256i s2_13_5 = + _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m256i s2_14_4 = + _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m256i s2_14_5 = + _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + 
step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm256_add_epi16(step1[19], step3[16]); + step2[17] = _mm256_add_epi16(step1[18], step3[17]); + step2[18] = _mm256_sub_epi16(step3[17], step1[18]); + step2[19] = _mm256_sub_epi16(step3[16], step1[19]); + step2[20] = _mm256_sub_epi16(step3[23], step1[20]); + step2[21] = _mm256_sub_epi16(step3[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[21], step3[22]); + step2[23] = _mm256_add_epi16(step1[20], step3[23]); + step2[24] = _mm256_add_epi16(step1[27], step3[24]); + step2[25] = _mm256_add_epi16(step1[26], step3[25]); + step2[26] = _mm256_sub_epi16(step3[25], step1[26]); + step2[27] = _mm256_sub_epi16(step3[24], step1[27]); + step2[28] = _mm256_sub_epi16(step3[31], step1[28]); + step2[29] = _mm256_sub_epi16(step3[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step3[30]); + step2[31] = _mm256_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_04_2 = + _mm256_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m256i out_04_3 = + _mm256_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m256i out_20_2 = + _mm256_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m256i out_20_3 = + _mm256_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m256i out_12_2 = + _mm256_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m256i out_12_3 = + _mm256_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m256i out_28_2 = + _mm256_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m256i out_28_3 = + _mm256_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m256i out_04_4 = + _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m256i out_04_5 = + _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m256i out_20_4 = + _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m256i out_20_5 = + _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m256i out_12_4 = + _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m256i out_12_5 = + _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m256i out_28_4 = + _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m256i out_28_5 = + _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = 
_mm256_packs_epi32(out_04_6, out_04_7); + out[20] = _mm256_packs_epi32(out_20_6, out_20_7); + out[12] = _mm256_packs_epi32(out_12_6, out_12_7); + out[28] = _mm256_packs_epi32(out_28_6, out_28_7); + } + { + step3[8] = _mm256_add_epi16(step2[9], step1[8]); + step3[9] = _mm256_sub_epi16(step1[8], step2[9]); + step3[10] = _mm256_sub_epi16(step1[11], step2[10]); + step3[11] = _mm256_add_epi16(step2[10], step1[11]); + step3[12] = _mm256_add_epi16(step2[13], step1[12]); + step3[13] = _mm256_sub_epi16(step1[12], step2[13]); + step3[14] = _mm256_sub_epi16(step1[15], step2[14]); + step3[15] = _mm256_add_epi16(step2[14], step1[15]); + } + { + const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]); + const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]); + const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]); + const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]); + const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]); + const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]); + const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]); + const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]); + const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m256i s3_17_4 = + _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m256i s3_17_5 = + _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m256i s3_18_4 = + _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m256i s3_18_5 = + _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m256i s3_21_4 = + _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m256i s3_21_5 = + _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m256i s3_22_4 = + _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m256i s3_22_5 = + _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, 
DCT_CONST_BITS); + const __m256i s3_25_4 = + _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m256i s3_25_5 = + _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m256i s3_26_4 = + _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m256i s3_26_5 = + _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m256i s3_29_4 = + _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m256i s3_29_5 = + _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m256i s3_30_4 = + _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m256i s3_30_5 = + _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]); + const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]); + const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]); + const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]); + const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]); + const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]); + const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]); + const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]); + const __m256i out_02_2 = + _mm256_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m256i out_02_3 = + _mm256_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m256i out_18_2 = + _mm256_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m256i out_18_3 = + _mm256_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m256i out_10_2 = + _mm256_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m256i out_10_3 = + _mm256_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m256i out_26_2 = + _mm256_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m256i out_26_3 = + _mm256_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m256i out_06_2 = + _mm256_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m256i out_06_3 = + _mm256_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m256i out_22_2 = + _mm256_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m256i out_22_3 = + _mm256_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m256i out_14_2 = + _mm256_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m256i out_14_3 = + _mm256_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m256i out_30_2 = + _mm256_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m256i out_30_3 = + _mm256_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m256i out_02_4 = + _mm256_add_epi32(out_02_2, 
k__DCT_CONST_ROUNDING); + const __m256i out_02_5 = + _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m256i out_18_4 = + _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m256i out_18_5 = + _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m256i out_10_4 = + _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m256i out_10_5 = + _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m256i out_26_4 = + _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m256i out_26_5 = + _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m256i out_06_4 = + _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m256i out_06_5 = + _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m256i out_22_4 = + _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m256i out_22_5 = + _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m256i out_14_4 = + _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m256i out_14_5 = + _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m256i out_30_4 = + _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m256i out_30_5 = + _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = _mm256_packs_epi32(out_02_6, out_02_7); + out[18] = _mm256_packs_epi32(out_18_6, out_18_7); + out[10] = _mm256_packs_epi32(out_10_6, out_10_7); + out[26] = _mm256_packs_epi32(out_26_6, out_26_7); + out[6] = _mm256_packs_epi32(out_06_6, out_06_7); + out[22] = _mm256_packs_epi32(out_22_6, out_22_7); + out[14] = _mm256_packs_epi32(out_14_6, out_14_7); + out[30] = _mm256_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm256_add_epi16(step3[17], step2[16]); + step1[17] = _mm256_sub_epi16(step2[16], step3[17]); + step1[18] = _mm256_sub_epi16(step2[19], step3[18]); + step1[19] = _mm256_add_epi16(step3[18], step2[19]); + step1[20] = _mm256_add_epi16(step3[21], step2[20]); + step1[21] = _mm256_sub_epi16(step2[20], step3[21]); + step1[22] = _mm256_sub_epi16(step2[23], step3[22]); + step1[23] = _mm256_add_epi16(step3[22], step2[23]); + step1[24] = _mm256_add_epi16(step3[25], step2[24]); + step1[25] = _mm256_sub_epi16(step2[24], step3[25]); + step1[26] = _mm256_sub_epi16(step2[27], step3[26]); + step1[27] = _mm256_add_epi16(step3[26], step2[27]); + step1[28] = _mm256_add_epi16(step3[29], step2[28]); + step1[29] = _mm256_sub_epi16(step2[28], 
step3[29]); + step1[30] = _mm256_sub_epi16(step2[31], step3[30]); + step1[31] = _mm256_add_epi16(step3[30], step2[31]); + } + // Final stage --- outputs indices are bit-reversed. + { + const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]); + const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]); + const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]); + const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]); + const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]); + const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]); + const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]); + const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]); + const __m256i out_01_2 = + _mm256_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m256i out_01_3 = + _mm256_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m256i out_17_2 = + _mm256_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m256i out_17_3 = + _mm256_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m256i out_09_2 = + _mm256_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m256i out_09_3 = + _mm256_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m256i out_25_2 = + _mm256_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m256i out_25_3 = + _mm256_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m256i out_07_2 = + _mm256_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m256i out_07_3 = + _mm256_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m256i out_23_2 = + _mm256_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m256i out_23_3 = + _mm256_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m256i out_15_2 = + _mm256_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m256i out_15_3 = + _mm256_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m256i out_31_2 = + _mm256_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m256i out_31_3 = + _mm256_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m256i out_01_4 = + _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m256i out_01_5 = + _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m256i out_17_4 = + _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m256i out_17_5 = + _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m256i out_09_4 = + _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m256i out_09_5 = + _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m256i out_25_4 = + _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m256i out_25_5 = + _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m256i out_07_4 = + _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m256i out_07_5 = + _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m256i out_23_4 = + _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m256i out_23_5 = + _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m256i out_15_4 = + _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m256i out_15_5 = + _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m256i out_31_4 = + _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m256i out_31_5 = + _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, 
DCT_CONST_BITS); + const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm256_packs_epi32(out_01_6, out_01_7); + out[17] = _mm256_packs_epi32(out_17_6, out_17_7); + out[9] = _mm256_packs_epi32(out_09_6, out_09_7); + out[25] = _mm256_packs_epi32(out_25_6, out_25_7); + out[7] = _mm256_packs_epi32(out_07_6, out_07_7); + out[23] = _mm256_packs_epi32(out_23_6, out_23_7); + out[15] = _mm256_packs_epi32(out_15_6, out_15_7); + out[31] = _mm256_packs_epi32(out_31_6, out_31_7); + } + { + const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]); + const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]); + const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]); + const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]); + const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]); + const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]); + const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]); + const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]); + const __m256i out_05_2 = + _mm256_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m256i out_05_3 = + _mm256_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m256i out_21_2 = + _mm256_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m256i out_21_3 = + _mm256_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m256i out_13_2 = + _mm256_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m256i out_13_3 = + _mm256_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m256i out_29_2 = + _mm256_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m256i out_29_3 = + _mm256_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m256i out_03_2 = + _mm256_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m256i out_03_3 = + _mm256_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m256i out_19_2 = + _mm256_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m256i out_19_3 = + _mm256_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m256i out_11_2 = + _mm256_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m256i out_11_3 = + _mm256_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m256i out_27_2 = + _mm256_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m256i out_27_3 = + _mm256_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m256i out_05_4 = + _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m256i out_05_5 = + _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m256i out_21_4 = + _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m256i out_21_5 = + _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m256i out_13_4 = + _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m256i 
out_13_5 = + _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m256i out_29_4 = + _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m256i out_29_5 = + _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m256i out_03_4 = + _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m256i out_03_5 = + _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m256i out_19_4 = + _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m256i out_19_5 = + _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m256i out_11_4 = + _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m256i out_11_5 = + _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m256i out_27_4 = + _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m256i out_27_5 = + _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm256_packs_epi32(out_05_6, out_05_7); + out[21] = _mm256_packs_epi32(out_21_6, out_21_7); + out[13] = _mm256_packs_epi32(out_13_6, out_13_7); + out[29] = _mm256_packs_epi32(out_29_6, out_29_7); + out[3] = _mm256_packs_epi32(out_03_6, out_03_7); + out[19] = _mm256_packs_epi32(out_19_6, out_19_7); + out[11] = _mm256_packs_epi32(out_11_6, out_11_7); + out[27] = _mm256_packs_epi32(out_27_6, out_27_7); + } #if FDCT32x32_HIGH_PRECISION } else { __m256i lstep1[64], lstep2[64], lstep3[64]; @@ -1157,32 +1378,32 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // stage 3 { // expanding to 32-bit length priori to addition operations - lstep2[ 0] = _mm256_unpacklo_epi16(step2[ 0], kZero); - lstep2[ 1] = _mm256_unpackhi_epi16(step2[ 0], kZero); - lstep2[ 2] = _mm256_unpacklo_epi16(step2[ 1], kZero); - lstep2[ 3] = _mm256_unpackhi_epi16(step2[ 1], kZero); - lstep2[ 4] = _mm256_unpacklo_epi16(step2[ 2], kZero); - lstep2[ 5] = _mm256_unpackhi_epi16(step2[ 2], kZero); - lstep2[ 6] = _mm256_unpacklo_epi16(step2[ 3], kZero); - lstep2[ 7] = _mm256_unpackhi_epi16(step2[ 3], kZero); - lstep2[ 8] = _mm256_unpacklo_epi16(step2[ 4], kZero); - lstep2[ 9] = _mm256_unpackhi_epi16(step2[ 4], kZero); - lstep2[10] = _mm256_unpacklo_epi16(step2[ 5], kZero); - lstep2[11] = _mm256_unpackhi_epi16(step2[ 5], kZero); - lstep2[12] = _mm256_unpacklo_epi16(step2[ 6], kZero); - lstep2[13] = _mm256_unpackhi_epi16(step2[ 6], kZero); - lstep2[14] = _mm256_unpacklo_epi16(step2[ 7], kZero); - lstep2[15] = _mm256_unpackhi_epi16(step2[ 
7], kZero); - lstep2[ 0] = _mm256_madd_epi16(lstep2[ 0], kOne); - lstep2[ 1] = _mm256_madd_epi16(lstep2[ 1], kOne); - lstep2[ 2] = _mm256_madd_epi16(lstep2[ 2], kOne); - lstep2[ 3] = _mm256_madd_epi16(lstep2[ 3], kOne); - lstep2[ 4] = _mm256_madd_epi16(lstep2[ 4], kOne); - lstep2[ 5] = _mm256_madd_epi16(lstep2[ 5], kOne); - lstep2[ 6] = _mm256_madd_epi16(lstep2[ 6], kOne); - lstep2[ 7] = _mm256_madd_epi16(lstep2[ 7], kOne); - lstep2[ 8] = _mm256_madd_epi16(lstep2[ 8], kOne); - lstep2[ 9] = _mm256_madd_epi16(lstep2[ 9], kOne); + lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero); + lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero); + lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero); + lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero); + lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero); + lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero); + lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero); + lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero); + lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero); + lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero); + lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero); + lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero); + lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero); + lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero); + lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero); + lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero); + lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne); + lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne); + lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne); + lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne); + lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne); + lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne); + lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne); + lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne); + lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne); + lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne); lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne); lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne); lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne); @@ -1190,22 +1411,22 @@ void FDCT32x32_2D_AVX2(const int16_t *input, lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne); lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne); - lstep3[ 0] = _mm256_add_epi32(lstep2[14], lstep2[ 0]); - lstep3[ 1] = _mm256_add_epi32(lstep2[15], lstep2[ 1]); - lstep3[ 2] = _mm256_add_epi32(lstep2[12], lstep2[ 2]); - lstep3[ 3] = _mm256_add_epi32(lstep2[13], lstep2[ 3]); - lstep3[ 4] = _mm256_add_epi32(lstep2[10], lstep2[ 4]); - lstep3[ 5] = _mm256_add_epi32(lstep2[11], lstep2[ 5]); - lstep3[ 6] = _mm256_add_epi32(lstep2[ 8], lstep2[ 6]); - lstep3[ 7] = _mm256_add_epi32(lstep2[ 9], lstep2[ 7]); - lstep3[ 8] = _mm256_sub_epi32(lstep2[ 6], lstep2[ 8]); - lstep3[ 9] = _mm256_sub_epi32(lstep2[ 7], lstep2[ 9]); - lstep3[10] = _mm256_sub_epi32(lstep2[ 4], lstep2[10]); - lstep3[11] = _mm256_sub_epi32(lstep2[ 5], lstep2[11]); - lstep3[12] = _mm256_sub_epi32(lstep2[ 2], lstep2[12]); - lstep3[13] = _mm256_sub_epi32(lstep2[ 3], lstep2[13]); - lstep3[14] = _mm256_sub_epi32(lstep2[ 0], lstep2[14]); - lstep3[15] = _mm256_sub_epi32(lstep2[ 1], lstep2[15]); + lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]); + lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]); + lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]); + lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]); + lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]); + lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]); + lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]); + lstep3[7] = 
_mm256_add_epi32(lstep2[9], lstep2[7]); + lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]); + lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]); + lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]); + lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]); + lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]); + lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]); + lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]); + lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]); } { const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); @@ -1221,14 +1442,22 @@ void FDCT32x32_2D_AVX2(const int16_t *input, const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); // dct_const_round_shift - const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m256i s3_10_4 = + _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = + _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = + _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = + _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = + _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = + _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = + _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = + _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); @@ -1343,10 +1572,10 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // stage 4 { // expanding to 32-bit length priori to addition operations - lstep2[16] = _mm256_unpacklo_epi16(step2[ 8], kZero); - lstep2[17] = _mm256_unpackhi_epi16(step2[ 8], kZero); - lstep2[18] = _mm256_unpacklo_epi16(step2[ 9], kZero); - lstep2[19] = _mm256_unpackhi_epi16(step2[ 9], kZero); + lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero); + lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero); + lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero); + lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero); lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero); lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero); lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero); @@ -1360,14 +1589,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input, lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne); lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne); - lstep1[ 0] = _mm256_add_epi32(lstep3[ 6], lstep3[ 0]); - lstep1[ 1] = _mm256_add_epi32(lstep3[ 7], lstep3[ 1]); - lstep1[ 2] = _mm256_add_epi32(lstep3[ 4], lstep3[ 2]); - lstep1[ 3] = _mm256_add_epi32(lstep3[ 5], lstep3[ 3]); - lstep1[ 4] = _mm256_sub_epi32(lstep3[ 2], lstep3[ 4]); - lstep1[ 5] = _mm256_sub_epi32(lstep3[ 3], lstep3[ 5]); - lstep1[ 6] = _mm256_sub_epi32(lstep3[ 0], lstep3[ 6]); - lstep1[ 7] = _mm256_sub_epi32(lstep3[ 1], 
lstep3[ 7]); + lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]); + lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]); + lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]); + lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]); + lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]); + lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]); + lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]); + lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]); lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]); lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]); lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]); @@ -1386,57 +1615,62 @@ void FDCT32x32_2D_AVX2(const int16_t *input, lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]); } { - // to be continued... - // - const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64); - const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64); - - u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]); - u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]); - u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]); - u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]); - - // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide - // instruction latency. - v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_m16); - v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_m16); - v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_m16); - v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_m16); - v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_p16); - v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_p16); - v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_p16); - v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_p16); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - - lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + // to be continued... + // + const __m256i k32_p16_p16 = + pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = + pair256_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide + // instruction latency. 
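// For reference while reading the u[]/v[] groups below: k_madd_epi32_avx2()
// appears to mirror the SSE2 k_madd_epi32 helper (a 32x32 -> 64-bit
// multiply-add over interleaved pairs) and k_packs_epi64_avx2() repacks the
// 64-bit sums into 32-bit lanes, so together with k__DCT_CONST_ROUNDING and
// the DCT_CONST_BITS shift each group reduces, per lane, to the usual scalar
// fdct32 butterfly. A minimal scalar sketch (illustrative helper name,
// assuming the DCT_CONST_BITS constant already used in this file):
static int32_t fdct32_butterfly_ref(int32_t a, int32_t b, int32_t c0,
                                    int32_t c1) {
  // a*c0 + b*c1 with the same rounding and shift the vector code applies.
  const int64_t sum = (int64_t)a * c0 + (int64_t)b * c1;
  return (int32_t)((sum + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}
// e.g. the k32_p16_m16 path appears to produce, lane by lane,
// fdct32_butterfly_ref(lstep3[12], lstep3[10], cospi_16_64, -cospi_16_64)
// into lstep1[10].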
+ v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); } { - const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64); - const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64); - const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64); - - u[ 0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]); - u[ 1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]); - u[ 2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]); - u[ 3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]); - u[ 4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]); - u[ 5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]); - u[ 6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]); - u[ 7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]); - u[ 8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]); - u[ 9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]); + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = + pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]); + u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]); + u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]); + u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]); + u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]); + u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]); + u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]); + u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]); + u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]); + u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]); u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]); u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]); u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]); @@ -1444,16 +1678,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]); u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]); - v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m08_p24); - v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m08_p24); - v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m08_p24); - v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m08_p24); - v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m08_p24); - v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m08_p24); - v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m08_p24); - v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m08_p24); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m24_m08); - v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m24_m08); + v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); + v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); + v[2] = k_madd_epi32_avx2(u[2], 
k32_m08_p24); + v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); + v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24); + v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08); + v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08); v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08); v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08); v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08); @@ -1464,29 +1698,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24); v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24); v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24); - v[20] = k_madd_epi32_avx2(u[ 8], k32_m08_p24); - v[21] = k_madd_epi32_avx2(u[ 9], k32_m08_p24); + v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24); + v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24); v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24); v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24); - v[24] = k_madd_epi32_avx2(u[ 4], k32_p24_p08); - v[25] = k_madd_epi32_avx2(u[ 5], k32_p24_p08); - v[26] = k_madd_epi32_avx2(u[ 6], k32_p24_p08); - v[27] = k_madd_epi32_avx2(u[ 7], k32_p24_p08); - v[28] = k_madd_epi32_avx2(u[ 0], k32_p24_p08); - v[29] = k_madd_epi32_avx2(u[ 1], k32_p24_p08); - v[30] = k_madd_epi32_avx2(u[ 2], k32_p24_p08); - v[31] = k_madd_epi32_avx2(u[ 3], k32_p24_p08); - - u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64_avx2(v[10], v[11]); - u[ 6] = k_packs_epi64_avx2(v[12], v[13]); - u[ 7] = k_packs_epi64_avx2(v[14], v[15]); - u[ 8] = k_packs_epi64_avx2(v[16], v[17]); - u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08); + v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08); + v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08); + v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08); + v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08); + v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08); + v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08); + v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); u[10] = k_packs_epi64_avx2(v[20], v[21]); u[11] = k_packs_epi64_avx2(v[22], v[23]); u[12] = k_packs_epi64_avx2(v[24], v[25]); @@ -1494,16 +1728,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = k_packs_epi64_avx2(v[28], v[29]); u[15] = k_packs_epi64_avx2(v[30], v[31]); - v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = 
_mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -1511,16 +1745,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - lstep1[36] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); - lstep1[37] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); - lstep1[38] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); - lstep1[39] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); - lstep1[40] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); - lstep1[41] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); - lstep1[42] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); - lstep1[43] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); - lstep1[52] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); - lstep1[53] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); @@ -1530,20 +1764,24 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // stage 5 { - lstep2[ 8] = _mm256_add_epi32(lstep1[10], lstep3[ 8]); - lstep2[ 9] = _mm256_add_epi32(lstep1[11], lstep3[ 9]); - lstep2[10] = _mm256_sub_epi32(lstep3[ 8], lstep1[10]); - lstep2[11] = _mm256_sub_epi32(lstep3[ 9], lstep1[11]); + lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]); + lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]); + lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]); + lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]); lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]); lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]); lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]); lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]); } { - const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64); - const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64); - const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64); - const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_p16_p16 = + pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = + pair256_set_epi32(cospi_16_64, -cospi_16_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); u[0] = _mm256_unpacklo_epi32(lstep1[0], 
lstep1[2]); u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]); @@ -1556,16 +1794,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide // instruction latency. - v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_p16); - v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_p16); - v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_p16); - v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_p16); - v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_m16); - v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_m16); - v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_m16); - v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_m16); - v[ 8] = k_madd_epi32_avx2(u[4], k32_p24_p08); - v[ 9] = k_madd_epi32_avx2(u[5], k32_p24_p08); + v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16); + v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08); + v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08); v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08); v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08); v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24); @@ -1600,14 +1838,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - sign[0] = _mm256_cmpgt_epi32(kZero,u[0]); - sign[1] = _mm256_cmpgt_epi32(kZero,u[1]); - sign[2] = _mm256_cmpgt_epi32(kZero,u[2]); - sign[3] = _mm256_cmpgt_epi32(kZero,u[3]); - sign[4] = _mm256_cmpgt_epi32(kZero,u[4]); - sign[5] = _mm256_cmpgt_epi32(kZero,u[5]); - sign[6] = _mm256_cmpgt_epi32(kZero,u[6]); - sign[7] = _mm256_cmpgt_epi32(kZero,u[7]); + sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); + sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); + sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); u[0] = _mm256_sub_epi32(u[0], sign[0]); u[1] = _mm256_sub_epi32(u[1], sign[1]); @@ -1637,15 +1875,18 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[7] = _mm256_srai_epi32(u[7], 2); // Combine - out[ 0] = _mm256_packs_epi32(u[0], u[1]); + out[0] = _mm256_packs_epi32(u[0], u[1]); out[16] = _mm256_packs_epi32(u[2], u[3]); - out[ 8] = _mm256_packs_epi32(u[4], u[5]); + out[8] = _mm256_packs_epi32(u[4], u[5]); out[24] = _mm256_packs_epi32(u[6], u[7]); } { - const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64); - const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64); - const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64); + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = + pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]); u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]); @@ -1664,8 +1905,8 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08); v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08); v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08); - v[ 8] = k_madd_epi32_avx2(u[4], k32_m08_p24); - v[ 9] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[8] = k_madd_epi32_avx2(u[4], 
k32_m08_p24); + v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24); v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24); v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24); v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08); @@ -1736,15 +1977,19 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // stage 6 { - const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64); - const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64); - const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64); - const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64); - - u[0] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]); - u[1] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]); - u[2] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]); - u[3] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]); + const __m256i k32_p28_p04 = + pair256_set_epi32(cospi_28_64, cospi_4_64); + const __m256i k32_p12_p20 = + pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_m20_p12 = + pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m04_p28 = + pair256_set_epi32(-cospi_4_64, cospi_28_64); + + u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); + u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); + u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); + u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); @@ -1753,10 +1998,10 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); - u[12] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]); - u[13] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]); - u[14] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]); - u[15] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]); + u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); + u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); + u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); + u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04); v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04); @@ -1766,8 +2011,8 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20); v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20); v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12); - v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12); + v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); + v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28); @@ -1802,14 +2047,14 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - sign[0] = _mm256_cmpgt_epi32(kZero,u[0]); - sign[1] = _mm256_cmpgt_epi32(kZero,u[1]); - sign[2] = _mm256_cmpgt_epi32(kZero,u[2]); - sign[3] = _mm256_cmpgt_epi32(kZero,u[3]); - sign[4] = _mm256_cmpgt_epi32(kZero,u[4]); - sign[5] = _mm256_cmpgt_epi32(kZero,u[5]); - sign[6] = _mm256_cmpgt_epi32(kZero,u[6]); - sign[7] = _mm256_cmpgt_epi32(kZero,u[7]); + sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); + sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); + 
sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); u[0] = _mm256_sub_epi32(u[0], sign[0]); u[1] = _mm256_sub_epi32(u[1], sign[1]); @@ -1838,7 +2083,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[6] = _mm256_srai_epi32(u[6], 2); u[7] = _mm256_srai_epi32(u[7], 2); - out[ 4] = _mm256_packs_epi32(u[0], u[1]); + out[4] = _mm256_packs_epi32(u[0], u[1]); out[20] = _mm256_packs_epi32(u[2], u[3]); out[12] = _mm256_packs_epi32(u[4], u[5]); out[28] = _mm256_packs_epi32(u[6], u[7]); @@ -1862,24 +2107,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]); } { - const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64); - const __m256i k32_m28_m04 = pair256_set_epi32(-cospi_28_64, -cospi_4_64); - const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64); - const __m256i k32_m12_m20 = pair256_set_epi32(-cospi_12_64, - -cospi_20_64); - const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64); - const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64); - - u[ 0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]); - u[ 1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]); - u[ 2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]); - u[ 3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]); - u[ 4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]); - u[ 5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]); - u[ 6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]); - u[ 7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]); - u[ 8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]); - u[ 9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]); + const __m256i k32_m04_p28 = + pair256_set_epi32(-cospi_4_64, cospi_28_64); + const __m256i k32_m28_m04 = + pair256_set_epi32(-cospi_28_64, -cospi_4_64); + const __m256i k32_m20_p12 = + pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m12_m20 = + pair256_set_epi32(-cospi_12_64, -cospi_20_64); + const __m256i k32_p12_p20 = + pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_p28_p04 = + pair256_set_epi32(cospi_28_64, cospi_4_64); + + u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]); + u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]); + u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]); + u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]); + u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]); + u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]); + u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]); + u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]); + u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]); + u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]); u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]); u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]); u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]); @@ -1887,16 +2137,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]); u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]); - v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m04_p28); - v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m04_p28); - v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m04_p28); - v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m04_p28); - v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m28_m04); - v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m28_m04); - v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m28_m04); - v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m28_m04); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12); - v[ 9] = 
k_madd_epi32_avx2(u[ 9], k32_m20_p12); + v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28); + v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28); + v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28); + v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28); + v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04); + v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04); + v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04); + v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04); + v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); + v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20); @@ -1907,29 +2157,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12); v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12); v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12); - v[20] = k_madd_epi32_avx2(u[ 8], k32_p12_p20); - v[21] = k_madd_epi32_avx2(u[ 9], k32_p12_p20); + v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20); + v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20); v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20); v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20); - v[24] = k_madd_epi32_avx2(u[ 4], k32_m04_p28); - v[25] = k_madd_epi32_avx2(u[ 5], k32_m04_p28); - v[26] = k_madd_epi32_avx2(u[ 6], k32_m04_p28); - v[27] = k_madd_epi32_avx2(u[ 7], k32_m04_p28); - v[28] = k_madd_epi32_avx2(u[ 0], k32_p28_p04); - v[29] = k_madd_epi32_avx2(u[ 1], k32_p28_p04); - v[30] = k_madd_epi32_avx2(u[ 2], k32_p28_p04); - v[31] = k_madd_epi32_avx2(u[ 3], k32_p28_p04); - - u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64_avx2(v[10], v[11]); - u[ 6] = k_packs_epi64_avx2(v[12], v[13]); - u[ 7] = k_packs_epi64_avx2(v[14], v[15]); - u[ 8] = k_packs_epi64_avx2(v[16], v[17]); - u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28); + v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28); + v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28); + v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28); + v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04); + v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04); + v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04); + v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); u[10] = k_packs_epi64_avx2(v[20], v[21]); u[11] = k_packs_epi64_avx2(v[22], v[23]); u[12] = k_packs_epi64_avx2(v[24], v[25]); @@ -1937,16 +2187,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = k_packs_epi64_avx2(v[28], v[29]); u[15] = k_packs_epi64_avx2(v[30], v[31]); - v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = 
_mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -1954,16 +2204,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - lstep3[34] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); - lstep3[35] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); - lstep3[36] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); - lstep3[37] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); - lstep3[42] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); - lstep3[43] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); - lstep3[44] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); - lstep3[45] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); - lstep3[50] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); - lstep3[51] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); @@ -1973,25 +2223,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // stage 7 { - const __m256i k32_p30_p02 = pair256_set_epi32(cospi_30_64, cospi_2_64); - const __m256i k32_p14_p18 = pair256_set_epi32(cospi_14_64, cospi_18_64); - const __m256i k32_p22_p10 = pair256_set_epi32(cospi_22_64, cospi_10_64); - const __m256i k32_p06_p26 = pair256_set_epi32(cospi_6_64, cospi_26_64); - const __m256i k32_m26_p06 = pair256_set_epi32(-cospi_26_64, cospi_6_64); - const __m256i k32_m10_p22 = pair256_set_epi32(-cospi_10_64, cospi_22_64); - const __m256i k32_m18_p14 = pair256_set_epi32(-cospi_18_64, cospi_14_64); - const __m256i k32_m02_p30 = pair256_set_epi32(-cospi_2_64, cospi_30_64); - - u[ 0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]); - u[ 1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]); - u[ 2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]); - u[ 3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]); - u[ 4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]); - u[ 5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]); - u[ 6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]); - u[ 7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]); - u[ 8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]); - u[ 9] = _mm256_unpackhi_epi32(lstep3[20], 
lstep3[26]); + const __m256i k32_p30_p02 = + pair256_set_epi32(cospi_30_64, cospi_2_64); + const __m256i k32_p14_p18 = + pair256_set_epi32(cospi_14_64, cospi_18_64); + const __m256i k32_p22_p10 = + pair256_set_epi32(cospi_22_64, cospi_10_64); + const __m256i k32_p06_p26 = + pair256_set_epi32(cospi_6_64, cospi_26_64); + const __m256i k32_m26_p06 = + pair256_set_epi32(-cospi_26_64, cospi_6_64); + const __m256i k32_m10_p22 = + pair256_set_epi32(-cospi_10_64, cospi_22_64); + const __m256i k32_m18_p14 = + pair256_set_epi32(-cospi_18_64, cospi_14_64); + const __m256i k32_m02_p30 = + pair256_set_epi32(-cospi_2_64, cospi_30_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]); + u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]); + u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]); + u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]); + u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]); + u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]); + u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]); + u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]); + u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]); + u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]); u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]); u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]); u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]); @@ -1999,16 +2257,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]); u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]); - v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p30_p02); - v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p30_p02); - v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p30_p02); - v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p30_p02); - v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p14_p18); - v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p14_p18); - v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p14_p18); - v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p14_p18); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p22_p10); - v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p22_p10); + v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02); + v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02); + v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02); + v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02); + v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18); + v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18); + v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18); + v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18); + v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10); + v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10); v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10); v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10); v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26); @@ -2019,29 +2277,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06); v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06); v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06); - v[20] = k_madd_epi32_avx2(u[ 8], k32_m10_p22); - v[21] = k_madd_epi32_avx2(u[ 9], k32_m10_p22); + v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22); + v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22); v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22); v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22); - v[24] = k_madd_epi32_avx2(u[ 4], k32_m18_p14); - v[25] = k_madd_epi32_avx2(u[ 5], k32_m18_p14); - v[26] = k_madd_epi32_avx2(u[ 6], k32_m18_p14); - v[27] = k_madd_epi32_avx2(u[ 7], k32_m18_p14); - v[28] = k_madd_epi32_avx2(u[ 0], k32_m02_p30); - v[29] = k_madd_epi32_avx2(u[ 1], k32_m02_p30); - v[30] = k_madd_epi32_avx2(u[ 2], k32_m02_p30); - v[31] = k_madd_epi32_avx2(u[ 3], k32_m02_p30); - - u[ 0] = 
k_packs_epi64_avx2(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64_avx2(v[10], v[11]); - u[ 6] = k_packs_epi64_avx2(v[12], v[13]); - u[ 7] = k_packs_epi64_avx2(v[14], v[15]); - u[ 8] = k_packs_epi64_avx2(v[16], v[17]); - u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14); + v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14); + v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14); + v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14); + v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30); + v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30); + v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30); + v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); u[10] = k_packs_epi64_avx2(v[20], v[21]); u[11] = k_packs_epi64_avx2(v[22], v[23]); u[12] = k_packs_epi64_avx2(v[24], v[25]); @@ -2049,16 +2307,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = k_packs_epi64_avx2(v[28], v[29]); u[15] = k_packs_epi64_avx2(v[30], v[31]); - v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2066,16 +2324,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = 
_mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); @@ -2083,33 +2341,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]); - v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]); - v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]); - v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]); - v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]); - v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]); - v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]); - v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]); - v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]); - v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]); - v[10] = _mm256_cmpgt_epi32(kZero,u[10]); - v[11] = _mm256_cmpgt_epi32(kZero,u[11]); - v[12] = _mm256_cmpgt_epi32(kZero,u[12]); - v[13] = _mm256_cmpgt_epi32(kZero,u[13]); - v[14] = _mm256_cmpgt_epi32(kZero,u[14]); - v[15] = _mm256_cmpgt_epi32(kZero,u[15]); - - u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]); + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); u[10] = _mm256_sub_epi32(u[10], v[10]); u[11] = _mm256_sub_epi32(u[11], v[11]); u[12] = _mm256_sub_epi32(u[12], v[12]); @@ -2117,16 +2375,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_sub_epi32(u[14], v[14]); u[15] = _mm256_sub_epi32(u[15], v[15]); - v[ 0] = _mm256_add_epi32(u[ 0], K32One); - v[ 1] = _mm256_add_epi32(u[ 1], K32One); - v[ 2] = _mm256_add_epi32(u[ 2], K32One); - v[ 3] = _mm256_add_epi32(u[ 3], K32One); - v[ 4] = _mm256_add_epi32(u[ 4], K32One); - v[ 5] = _mm256_add_epi32(u[ 5], K32One); - v[ 6] = _mm256_add_epi32(u[ 
6], K32One); - v[ 7] = _mm256_add_epi32(u[ 7], K32One); - v[ 8] = _mm256_add_epi32(u[ 8], K32One); - v[ 9] = _mm256_add_epi32(u[ 9], K32One); + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); v[10] = _mm256_add_epi32(u[10], K32One); v[11] = _mm256_add_epi32(u[11], K32One); v[12] = _mm256_add_epi32(u[12], K32One); @@ -2134,16 +2392,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], K32One); v[15] = _mm256_add_epi32(u[15], K32One); - u[ 0] = _mm256_srai_epi32(v[ 0], 2); - u[ 1] = _mm256_srai_epi32(v[ 1], 2); - u[ 2] = _mm256_srai_epi32(v[ 2], 2); - u[ 3] = _mm256_srai_epi32(v[ 3], 2); - u[ 4] = _mm256_srai_epi32(v[ 4], 2); - u[ 5] = _mm256_srai_epi32(v[ 5], 2); - u[ 6] = _mm256_srai_epi32(v[ 6], 2); - u[ 7] = _mm256_srai_epi32(v[ 7], 2); - u[ 8] = _mm256_srai_epi32(v[ 8], 2); - u[ 9] = _mm256_srai_epi32(v[ 9], 2); + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); u[10] = _mm256_srai_epi32(v[10], 2); u[11] = _mm256_srai_epi32(v[11], 2); u[12] = _mm256_srai_epi32(v[12], 2); @@ -2151,11 +2409,11 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], 2); u[15] = _mm256_srai_epi32(v[15], 2); - out[ 2] = _mm256_packs_epi32(u[0], u[1]); + out[2] = _mm256_packs_epi32(u[0], u[1]); out[18] = _mm256_packs_epi32(u[2], u[3]); out[10] = _mm256_packs_epi32(u[4], u[5]); out[26] = _mm256_packs_epi32(u[6], u[7]); - out[ 6] = _mm256_packs_epi32(u[8], u[9]); + out[6] = _mm256_packs_epi32(u[8], u[9]); out[22] = _mm256_packs_epi32(u[10], u[11]); out[14] = _mm256_packs_epi32(u[12], u[13]); out[30] = _mm256_packs_epi32(u[14], u[15]); @@ -2196,25 +2454,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // stage 8 { - const __m256i k32_p31_p01 = pair256_set_epi32(cospi_31_64, cospi_1_64); - const __m256i k32_p15_p17 = pair256_set_epi32(cospi_15_64, cospi_17_64); - const __m256i k32_p23_p09 = pair256_set_epi32(cospi_23_64, cospi_9_64); - const __m256i k32_p07_p25 = pair256_set_epi32(cospi_7_64, cospi_25_64); - const __m256i k32_m25_p07 = pair256_set_epi32(-cospi_25_64, cospi_7_64); - const __m256i k32_m09_p23 = pair256_set_epi32(-cospi_9_64, cospi_23_64); - const __m256i k32_m17_p15 = pair256_set_epi32(-cospi_17_64, cospi_15_64); - const __m256i k32_m01_p31 = pair256_set_epi32(-cospi_1_64, cospi_31_64); - - u[ 0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]); - u[ 1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]); - u[ 2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]); - u[ 3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]); - u[ 4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]); - u[ 5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]); - u[ 6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]); - u[ 7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]); - u[ 8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]); - u[ 9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]); + const __m256i k32_p31_p01 
= + pair256_set_epi32(cospi_31_64, cospi_1_64); + const __m256i k32_p15_p17 = + pair256_set_epi32(cospi_15_64, cospi_17_64); + const __m256i k32_p23_p09 = + pair256_set_epi32(cospi_23_64, cospi_9_64); + const __m256i k32_p07_p25 = + pair256_set_epi32(cospi_7_64, cospi_25_64); + const __m256i k32_m25_p07 = + pair256_set_epi32(-cospi_25_64, cospi_7_64); + const __m256i k32_m09_p23 = + pair256_set_epi32(-cospi_9_64, cospi_23_64); + const __m256i k32_m17_p15 = + pair256_set_epi32(-cospi_17_64, cospi_15_64); + const __m256i k32_m01_p31 = + pair256_set_epi32(-cospi_1_64, cospi_31_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]); + u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]); + u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]); + u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]); + u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]); + u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]); + u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]); + u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]); + u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]); + u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]); u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]); u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]); u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]); @@ -2222,16 +2488,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]); u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]); - v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p31_p01); - v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p31_p01); - v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p31_p01); - v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p31_p01); - v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p15_p17); - v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p15_p17); - v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p15_p17); - v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p15_p17); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p23_p09); - v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p23_p09); + v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01); + v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01); + v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01); + v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01); + v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17); + v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17); + v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17); + v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17); + v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09); + v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09); v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09); v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09); v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25); @@ -2242,29 +2508,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07); v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07); v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07); - v[20] = k_madd_epi32_avx2(u[ 8], k32_m09_p23); - v[21] = k_madd_epi32_avx2(u[ 9], k32_m09_p23); + v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23); + v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23); v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23); v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23); - v[24] = k_madd_epi32_avx2(u[ 4], k32_m17_p15); - v[25] = k_madd_epi32_avx2(u[ 5], k32_m17_p15); - v[26] = k_madd_epi32_avx2(u[ 6], k32_m17_p15); - v[27] = k_madd_epi32_avx2(u[ 7], k32_m17_p15); - v[28] = k_madd_epi32_avx2(u[ 0], k32_m01_p31); - v[29] = k_madd_epi32_avx2(u[ 1], k32_m01_p31); - v[30] = k_madd_epi32_avx2(u[ 2], k32_m01_p31); - v[31] = k_madd_epi32_avx2(u[ 3], k32_m01_p31); - - u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); - u[ 1] = 
k_packs_epi64_avx2(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64_avx2(v[10], v[11]); - u[ 6] = k_packs_epi64_avx2(v[12], v[13]); - u[ 7] = k_packs_epi64_avx2(v[14], v[15]); - u[ 8] = k_packs_epi64_avx2(v[16], v[17]); - u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15); + v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15); + v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15); + v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15); + v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31); + v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31); + v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31); + v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); u[10] = k_packs_epi64_avx2(v[20], v[21]); u[11] = k_packs_epi64_avx2(v[22], v[23]); u[12] = k_packs_epi64_avx2(v[24], v[25]); @@ -2272,16 +2538,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = k_packs_epi64_avx2(v[28], v[29]); u[15] = k_packs_epi64_avx2(v[30], v[31]); - v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2289,16 +2555,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = 
_mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); @@ -2306,33 +2572,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]); - v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]); - v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]); - v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]); - v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]); - v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]); - v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]); - v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]); - v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]); - v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]); - v[10] = _mm256_cmpgt_epi32(kZero,u[10]); - v[11] = _mm256_cmpgt_epi32(kZero,u[11]); - v[12] = _mm256_cmpgt_epi32(kZero,u[12]); - v[13] = _mm256_cmpgt_epi32(kZero,u[13]); - v[14] = _mm256_cmpgt_epi32(kZero,u[14]); - v[15] = _mm256_cmpgt_epi32(kZero,u[15]); - - u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]); + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); u[10] = _mm256_sub_epi32(u[10], v[10]); u[11] = _mm256_sub_epi32(u[11], v[11]); u[12] = _mm256_sub_epi32(u[12], v[12]); @@ -2374,35 +2640,43 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], 2); u[15] = _mm256_srai_epi32(v[15], 2); - out[ 1] = _mm256_packs_epi32(u[0], u[1]); + out[1] = _mm256_packs_epi32(u[0], u[1]); out[17] = _mm256_packs_epi32(u[2], u[3]); - out[ 9] = _mm256_packs_epi32(u[4], u[5]); + out[9] = _mm256_packs_epi32(u[4], u[5]); out[25] = _mm256_packs_epi32(u[6], u[7]); - out[ 7] = _mm256_packs_epi32(u[8], u[9]); + out[7] = _mm256_packs_epi32(u[8], u[9]); 
out[23] = _mm256_packs_epi32(u[10], u[11]); out[15] = _mm256_packs_epi32(u[12], u[13]); out[31] = _mm256_packs_epi32(u[14], u[15]); } { - const __m256i k32_p27_p05 = pair256_set_epi32(cospi_27_64, cospi_5_64); - const __m256i k32_p11_p21 = pair256_set_epi32(cospi_11_64, cospi_21_64); - const __m256i k32_p19_p13 = pair256_set_epi32(cospi_19_64, cospi_13_64); - const __m256i k32_p03_p29 = pair256_set_epi32(cospi_3_64, cospi_29_64); - const __m256i k32_m29_p03 = pair256_set_epi32(-cospi_29_64, cospi_3_64); - const __m256i k32_m13_p19 = pair256_set_epi32(-cospi_13_64, cospi_19_64); - const __m256i k32_m21_p11 = pair256_set_epi32(-cospi_21_64, cospi_11_64); - const __m256i k32_m05_p27 = pair256_set_epi32(-cospi_5_64, cospi_27_64); - - u[ 0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]); - u[ 1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]); - u[ 2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]); - u[ 3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]); - u[ 4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]); - u[ 5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]); - u[ 6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]); - u[ 7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]); - u[ 8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]); - u[ 9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]); + const __m256i k32_p27_p05 = + pair256_set_epi32(cospi_27_64, cospi_5_64); + const __m256i k32_p11_p21 = + pair256_set_epi32(cospi_11_64, cospi_21_64); + const __m256i k32_p19_p13 = + pair256_set_epi32(cospi_19_64, cospi_13_64); + const __m256i k32_p03_p29 = + pair256_set_epi32(cospi_3_64, cospi_29_64); + const __m256i k32_m29_p03 = + pair256_set_epi32(-cospi_29_64, cospi_3_64); + const __m256i k32_m13_p19 = + pair256_set_epi32(-cospi_13_64, cospi_19_64); + const __m256i k32_m21_p11 = + pair256_set_epi32(-cospi_21_64, cospi_11_64); + const __m256i k32_m05_p27 = + pair256_set_epi32(-cospi_5_64, cospi_27_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]); + u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]); + u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]); + u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]); + u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]); + u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]); + u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]); + u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]); + u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]); + u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]); u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]); u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]); u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]); @@ -2410,16 +2684,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]); u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]); - v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p27_p05); - v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p27_p05); - v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p27_p05); - v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p27_p05); - v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p11_p21); - v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p11_p21); - v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p11_p21); - v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p11_p21); - v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p19_p13); - v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p19_p13); + v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05); + v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05); + v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05); + v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05); + v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21); 
+ v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21); + v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21); + v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21); + v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13); + v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13); v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13); v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13); v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29); @@ -2430,29 +2704,29 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03); v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03); v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03); - v[20] = k_madd_epi32_avx2(u[ 8], k32_m13_p19); - v[21] = k_madd_epi32_avx2(u[ 9], k32_m13_p19); + v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19); + v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19); v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19); v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19); - v[24] = k_madd_epi32_avx2(u[ 4], k32_m21_p11); - v[25] = k_madd_epi32_avx2(u[ 5], k32_m21_p11); - v[26] = k_madd_epi32_avx2(u[ 6], k32_m21_p11); - v[27] = k_madd_epi32_avx2(u[ 7], k32_m21_p11); - v[28] = k_madd_epi32_avx2(u[ 0], k32_m05_p27); - v[29] = k_madd_epi32_avx2(u[ 1], k32_m05_p27); - v[30] = k_madd_epi32_avx2(u[ 2], k32_m05_p27); - v[31] = k_madd_epi32_avx2(u[ 3], k32_m05_p27); - - u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64_avx2(v[10], v[11]); - u[ 6] = k_packs_epi64_avx2(v[12], v[13]); - u[ 7] = k_packs_epi64_avx2(v[14], v[15]); - u[ 8] = k_packs_epi64_avx2(v[16], v[17]); - u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11); + v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11); + v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11); + v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11); + v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27); + v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27); + v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27); + v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); u[10] = k_packs_epi64_avx2(v[20], v[21]); u[11] = k_packs_epi64_avx2(v[22], v[23]); u[12] = k_packs_epi64_avx2(v[24], v[25]); @@ -2460,16 +2734,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = k_packs_epi64_avx2(v[28], v[29]); u[15] = k_packs_epi64_avx2(v[30], v[31]); - v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = 
_mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2477,16 +2751,16 @@ void FDCT32x32_2D_AVX2(const int16_t *input, v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); @@ -2494,33 +2768,33 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]); - v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]); - v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]); - v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]); - v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]); - v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]); - v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]); - v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]); - v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]); - v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]); - v[10] = _mm256_cmpgt_epi32(kZero,u[10]); - v[11] = _mm256_cmpgt_epi32(kZero,u[11]); - v[12] = _mm256_cmpgt_epi32(kZero,u[12]); - v[13] = _mm256_cmpgt_epi32(kZero,u[13]); - v[14] = _mm256_cmpgt_epi32(kZero,u[14]); - v[15] = _mm256_cmpgt_epi32(kZero,u[15]); - - u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]); + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, 
u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); u[10] = _mm256_sub_epi32(u[10], v[10]); u[11] = _mm256_sub_epi32(u[11], v[11]); u[12] = _mm256_sub_epi32(u[12], v[12]); @@ -2562,11 +2836,11 @@ void FDCT32x32_2D_AVX2(const int16_t *input, u[14] = _mm256_srai_epi32(v[14], 2); u[15] = _mm256_srai_epi32(v[15], 2); - out[ 5] = _mm256_packs_epi32(u[0], u[1]); + out[5] = _mm256_packs_epi32(u[0], u[1]); out[21] = _mm256_packs_epi32(u[2], u[3]); out[13] = _mm256_packs_epi32(u[4], u[5]); out[29] = _mm256_packs_epi32(u[6], u[7]); - out[ 3] = _mm256_packs_epi32(u[8], u[9]); + out[3] = _mm256_packs_epi32(u[8], u[9]); out[19] = _mm256_packs_epi32(u[10], u[11]); out[11] = _mm256_packs_epi32(u[12], u[13]); out[27] = _mm256_packs_epi32(u[14], u[15]); @@ -2576,13 +2850,13 @@ void FDCT32x32_2D_AVX2(const int16_t *input, // Transpose the results, do it as four 8x8 transposes. { int transpose_block; - int16_t *output_currStep,*output_nextStep; - if (0 == pass){ - output_currStep = &intermediate[column_start * 32]; - output_nextStep = &intermediate[(column_start + 8) * 32]; - } else{ - output_currStep = &output_org[column_start * 32]; - output_nextStep = &output_org[(column_start + 8) * 32]; + int16_t *output_currStep, *output_nextStep; + if (0 == pass) { + output_currStep = &intermediate[column_start * 32]; + output_nextStep = &intermediate[(column_start + 8) * 32]; + } else { + output_currStep = &output_org[column_start * 32]; + output_nextStep = &output_org[(column_start + 8) * 32]; } for (transpose_block = 0; transpose_block < 4; ++transpose_block) { __m256i *this_out = &out[8 * transpose_block]; @@ -2685,23 +2959,39 @@ void FDCT32x32_2D_AVX2(const int16_t *input, } // Note: even though all these stores are aligned, using the aligned // intrinsic make the code slightly slower. 
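Structurally, each reflowed store pair below writes one 256-bit transposed row as two 128-bit halves: the low half to the current 8-column strip (output_currStep) and the high half, via _mm256_extractf128_si256, to the strip 8 * 32 elements further on (output_nextStep). A sketch under that reading (the helper name is illustrative, not part of the patch):

    #include <immintrin.h>
    #include <stdint.h>

    /* Sketch of the split store used below for each transposed row. */
    static void sketch_store_halves(int16_t *curr, int16_t *next, int row,
                                    __m256i tr2_row) {
      _mm_storeu_si128((__m128i *)(curr + row * 32),
                       _mm256_castsi256_si128(tr2_row));      /* low 128 bits  */
      _mm_storeu_si128((__m128i *)(next + row * 32),
                       _mm256_extractf128_si256(tr2_row, 1)); /* high 128 bits */
    }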
- _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), _mm256_castsi256_si128(tr2_0)); - _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), _mm256_castsi256_si128(tr2_1)); - _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), _mm256_castsi256_si128(tr2_2)); - _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), _mm256_castsi256_si128(tr2_3)); - _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), _mm256_castsi256_si128(tr2_4)); - _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), _mm256_castsi256_si128(tr2_5)); - _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), _mm256_castsi256_si128(tr2_6)); - _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), _mm256_castsi256_si128(tr2_7)); - - _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), _mm256_extractf128_si256(tr2_0,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), _mm256_extractf128_si256(tr2_1,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), _mm256_extractf128_si256(tr2_2,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), _mm256_extractf128_si256(tr2_3,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), _mm256_extractf128_si256(tr2_4,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), _mm256_extractf128_si256(tr2_5,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1)); + _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), + _mm256_castsi256_si128(tr2_0)); + _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), + _mm256_castsi256_si128(tr2_1)); + _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), + _mm256_castsi256_si128(tr2_2)); + _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), + _mm256_castsi256_si128(tr2_3)); + _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), + _mm256_castsi256_si128(tr2_4)); + _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), + _mm256_castsi256_si128(tr2_5)); + _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), + _mm256_castsi256_si128(tr2_6)); + _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), + _mm256_castsi256_si128(tr2_7)); + + _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), + _mm256_extractf128_si256(tr2_0, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), + _mm256_extractf128_si256(tr2_1, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), + _mm256_extractf128_si256(tr2_2, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), + _mm256_extractf128_si256(tr2_3, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), + _mm256_extractf128_si256(tr2_4, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), + _mm256_extractf128_si256(tr2_5, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), + _mm256_extractf128_si256(tr2_6, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), + _mm256_extractf128_si256(tr2_7, 1)); // Process next 8x8 output_currStep += 8; output_nextStep += 8; diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h index b85ae103fa470a6c2356e671653070a8ae0eef62..37443339094b30acd90437e44c3e8b6007d32055 100644 --- a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h +++ b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -22,42 +22,37 @@ #define SUB_EPI16 _mm_subs_epi16 #if FDCT32x32_HIGH_PRECISION void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) { - int i, j; - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], 
temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = intermediate[j * 32 + i]; - vpx_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - out[j + i * 32] = - (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); - } + int i, j; + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; + vpx_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + out[j + i * 32] = + (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); + } } - #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c - #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c +#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c +#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c #else void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) { - int i, j; - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) - temp_in[j] = intermediate[j * 32 + i]; - vpx_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) - out[j + i * 32] = (tran_low_t)temp_out[j]; - } + int i, j; + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; + vpx_fdct32(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; + } } - #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c - #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c +#define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c +#define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c #endif // FDCT32x32_HIGH_PRECISION #else #define ADD_EPI16 _mm_add_epi16 #define SUB_EPI16 _mm_sub_epi16 #endif // DCT_HIGH_BIT_DEPTH - -void FDCT32x32_2D(const int16_t *input, - tran_low_t *output_org, int stride) { +void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { // Calculate pre-multiplied strides const int str1 = stride; const int str2 = 2 * stride; @@ -70,42 +65,42 @@ void FDCT32x32_2D(const int16_t *input, // by constructing the 32 bit constant corresponding to that pair. 
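As a rough scalar sketch of how the paired constants declared below are consumed: each pair_set_epi16(c0, c1) constant feeds _mm_madd_epi16 on inputs interleaved with _mm_unpacklo/hi_epi16, producing a * c0 + b * c1 per 32-bit lane, which dct_const_round_shift then rounds and shifts. The helper name and the DCT_CONST_BITS value (taken as 14) are illustrative assumptions, not part of this patch:

    #include <stdint.h>

    /* Illustrative only: scalar equivalent of one madd + dct_const_round_shift
       lane, assuming DCT_CONST_BITS == 14. */
    #define SKETCH_DCT_CONST_BITS 14
    #define SKETCH_DCT_CONST_ROUNDING (1 << (SKETCH_DCT_CONST_BITS - 1))

    static int32_t sketch_butterfly(int16_t a, int16_t b, int16_t c0, int16_t c1) {
      const int32_t sum = (int32_t)a * c0 + (int32_t)b * c1;
      return (sum + SKETCH_DCT_CONST_ROUNDING) >> SKETCH_DCT_CONST_BITS;
    }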
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); - const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); - const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); - const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); - const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); - const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); - const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); - const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); - const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); - const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); - const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); - const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); - const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); - const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); - const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); - const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + 
const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); + const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i kZero = _mm_set1_epi16(0); - const __m128i kOne = _mm_set1_epi16(1); + const __m128i kOne = _mm_set1_epi16(1); // Do the two transform/transpose passes int pass; #if DCT_HIGH_BIT_DEPTH @@ -123,125 +118,125 @@ void FDCT32x32_2D(const int16_t *input, // Note: even though all the loads below are aligned, using the aligned // intrinsic make the code slightly slower. if (0 == pass) { - const int16_t *in = &input[column_start]; + const int16_t *in = &input[column_start]; // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; // Note: the next four blocks could be in a loop. That would help the // instruction cache but is actually slower. 
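For orientation, a scalar sketch (hypothetical names, not part of the patch) of what each of the four blocks below computes for one row pair: the sum and difference of row i and its mirror row 31 - i, each pre-scaled by 4 as in the commented formula above.

    #include <stdint.h>

    /* Sketch of one pass-0 step1 butterfly: step1[i] and step1[31 - i]. */
    static void sketch_step1_pair(const int16_t *in, int stride, int i,
                                  int16_t *sum, int16_t *diff) {
      const int16_t a = in[i * stride];        /* row i             */
      const int16_t b = in[(31 - i) * stride]; /* mirror row 31 - i */
      *sum = (int16_t)((a + b) << 2);          /* becomes step1[i]      */
      *diff = (int16_t)((a - b) << 2);         /* becomes step1[31 - i] */
    }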
{ - const int16_t *ina = in + 0 * str1; - const int16_t *inb = in + 31 * str1; - __m128i *step1a = &step1[ 0]; + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; + __m128i *step1a = &step1[0]; __m128i *step1b = &step1[31]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); step1b[-3] = _mm_sub_epi16(ina3, inb3); step1b[-2] = _mm_sub_epi16(ina2, inb2); step1b[-1] = _mm_sub_epi16(ina1, inb1); step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); step1b[-3] = _mm_slli_epi16(step1b[-3], 2); step1b[-2] = _mm_slli_epi16(step1b[-2], 2); step1b[-1] = _mm_slli_epi16(step1b[-1], 2); step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 4 * str1; - const int16_t *inb = in + 27 * str1; - __m128i *step1a = &step1[ 4]; + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; + __m128i *step1a = &step1[4]; __m128i *step1b = &step1[27]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina 
+ str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); step1b[-3] = _mm_sub_epi16(ina3, inb3); step1b[-2] = _mm_sub_epi16(ina2, inb2); step1b[-1] = _mm_sub_epi16(ina1, inb1); step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); step1b[-3] = _mm_slli_epi16(step1b[-3], 2); step1b[-2] = _mm_slli_epi16(step1b[-2], 2); step1b[-1] = _mm_slli_epi16(step1b[-1], 2); step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 8 * str1; - const int16_t *inb = in + 23 * str1; - __m128i *step1a = &step1[ 8]; + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; + __m128i *step1a = &step1[8]; __m128i *step1b = &step1[23]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); step1b[-3] = _mm_sub_epi16(ina3, inb3); step1b[-2] = _mm_sub_epi16(ina2, inb2); step1b[-1] = _mm_sub_epi16(ina1, inb1); step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); step1b[-3] = _mm_slli_epi16(step1b[-3], 2); step1b[-2] = _mm_slli_epi16(step1b[-2], 2); step1b[-1] = 
_mm_slli_epi16(step1b[-1], 2); step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - const int16_t *ina = in + 12 * str1; - const int16_t *inb = in + 19 * str1; + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; __m128i *step1a = &step1[12]; __m128i *step1b = &step1[19]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); step1b[-3] = _mm_sub_epi16(ina3, inb3); step1b[-2] = _mm_sub_epi16(ina2, inb2); step1b[-1] = _mm_sub_epi16(ina1, inb1); step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); step1b[-3] = _mm_slli_epi16(step1b[-3], 2); step1b[-2] = _mm_slli_epi16(step1b[-2], 2); step1b[-1] = _mm_slli_epi16(step1b[-1], 2); @@ -256,14 +251,14 @@ void FDCT32x32_2D(const int16_t *input, // Note: the next four blocks could be in a loop. That would help the // instruction cache but is actually slower. 
{ - __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); - __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); - __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); - __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); - __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); - __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); - __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); - __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); + __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); + __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); + __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); + __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); + __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); + __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); + __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); + __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); step1[0] = ADD_EPI16(in00, in31); step1[1] = ADD_EPI16(in01, in30); step1[2] = ADD_EPI16(in02, in29); @@ -283,14 +278,14 @@ void FDCT32x32_2D(const int16_t *input, #endif // DCT_HIGH_BIT_DEPTH } { - __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); - __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); - __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); - __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); - __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); - __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); - __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); - __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); + __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); + __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); + __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); + __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); + __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); + __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); + __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); + __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); step1[4] = ADD_EPI16(in04, in27); step1[5] = ADD_EPI16(in05, in26); step1[6] = ADD_EPI16(in06, in25); @@ -310,14 +305,14 @@ void FDCT32x32_2D(const int16_t *input, #endif // DCT_HIGH_BIT_DEPTH } { - __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); - __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); - __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); - __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); - __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); - __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); - __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); - __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); + __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); + __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); + __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); + __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); + __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); + __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); + __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); + __m128i in23 = 
_mm_loadu_si128((const __m128i *)(in + 23 * 32)); step1[8] = ADD_EPI16(in08, in23); step1[9] = ADD_EPI16(in09, in22); step1[10] = ADD_EPI16(in10, in21); @@ -337,14 +332,14 @@ void FDCT32x32_2D(const int16_t *input, #endif // DCT_HIGH_BIT_DEPTH } { - __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); - __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); - __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); - __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); - __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); - __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); - __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); - __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); + __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); + __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); + __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); + __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); + __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); + __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); + __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); + __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); step1[12] = ADD_EPI16(in12, in19); step1[13] = ADD_EPI16(in13, in18); step1[14] = ADD_EPI16(in14, in17); @@ -372,10 +367,10 @@ void FDCT32x32_2D(const int16_t *input, step2[3] = ADD_EPI16(step1[3], step1[12]); step2[4] = ADD_EPI16(step1[4], step1[11]); step2[5] = ADD_EPI16(step1[5], step1[10]); - step2[6] = ADD_EPI16(step1[6], step1[ 9]); - step2[7] = ADD_EPI16(step1[7], step1[ 8]); - step2[8] = SUB_EPI16(step1[7], step1[ 8]); - step2[9] = SUB_EPI16(step1[6], step1[ 9]); + step2[6] = ADD_EPI16(step1[6], step1[9]); + step2[7] = ADD_EPI16(step1[7], step1[8]); + step2[8] = SUB_EPI16(step1[7], step1[8]); + step2[9] = SUB_EPI16(step1[6], step1[9]); step2[10] = SUB_EPI16(step1[5], step1[10]); step2[11] = SUB_EPI16(step1[4], step1[11]); step2[12] = SUB_EPI16(step1[3], step1[12]); @@ -384,9 +379,8 @@ void FDCT32x32_2D(const int16_t *input, step2[15] = SUB_EPI16(step1[0], step1[15]); #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x16( - &step2[0], &step2[1], &step2[2], &step2[3], - &step2[4], &step2[5], &step2[6], &step2[7], - &step2[8], &step2[9], &step2[10], &step2[11], + &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], + &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], &step2[12], &step2[13], &step2[14], &step2[15]); if (overflow) { if (pass == 0) @@ -482,16 +476,16 @@ void FDCT32x32_2D(const int16_t *input, // dump the magnitude by half, hence the intermediate values are within // the range of 16 bits. 
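In scalar terms, the pass-1 rounding inside the block below is the same expression used by vpx_fdct32x32_rows_c earlier in this patch: the _mm_cmplt_epi16 mask is -1 for negative lanes, so subtracting it adds 1 before the extra kOne and the arithmetic shift by 2. A minimal sketch (helper name is illustrative):

    #include <stdint.h>

    /* Sketch: (x + 1 + (x < 0)) >> 2, matching the cmplt/sub/add/srai sequence. */
    static int16_t sketch_round_shift2(int16_t x) {
      return (int16_t)((x + 1 + (x < 0)) >> 2);
    }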
if (1 == pass) { - __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero); - __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero); - __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero); - __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero); - __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero); - __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero); - __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero); - __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero); - __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); - __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); + __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero); + __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero); + __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero); + __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero); + __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero); + __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero); + __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero); + __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero); + __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero); + __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero); __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); @@ -515,16 +509,16 @@ void FDCT32x32_2D(const int16_t *input, __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); - step2[0] = SUB_EPI16(step2[ 0], s3_00_0); - step2[1] = SUB_EPI16(step2[ 1], s3_01_0); - step2[2] = SUB_EPI16(step2[ 2], s3_02_0); - step2[3] = SUB_EPI16(step2[ 3], s3_03_0); - step2[4] = SUB_EPI16(step2[ 4], s3_04_0); - step2[5] = SUB_EPI16(step2[ 5], s3_05_0); - step2[6] = SUB_EPI16(step2[ 6], s3_06_0); - step2[7] = SUB_EPI16(step2[ 7], s3_07_0); - step2[8] = SUB_EPI16(step2[ 8], s2_08_0); - step2[9] = SUB_EPI16(step2[ 9], s2_09_0); + step2[0] = SUB_EPI16(step2[0], s3_00_0); + step2[1] = SUB_EPI16(step2[1], s3_01_0); + step2[2] = SUB_EPI16(step2[2], s3_02_0); + step2[3] = SUB_EPI16(step2[3], s3_03_0); + step2[4] = SUB_EPI16(step2[4], s3_04_0); + step2[5] = SUB_EPI16(step2[5], s3_05_0); + step2[6] = SUB_EPI16(step2[6], s3_06_0); + step2[7] = SUB_EPI16(step2[7], s3_07_0); + step2[8] = SUB_EPI16(step2[8], s2_08_0); + step2[9] = SUB_EPI16(step2[9], s2_09_0); step2[10] = SUB_EPI16(step2[10], s3_10_0); step2[11] = SUB_EPI16(step2[11], s3_11_0); step2[12] = SUB_EPI16(step2[12], s3_12_0); @@ -549,29 +543,27 @@ void FDCT32x32_2D(const int16_t *input, step1[31] = SUB_EPI16(step1[31], s3_31_0); #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x32( - &step2[0], &step2[1], &step2[2], &step2[3], - &step2[4], &step2[5], &step2[6], &step2[7], - &step2[8], &step2[9], &step2[10], &step2[11], - &step2[12], &step2[13], &step2[14], &step2[15], - &step1[16], &step1[17], &step1[18], &step1[19], - &step2[20], &step2[21], &step2[22], &step2[23], - &step2[24], &step2[25], &step2[26], &step2[27], - &step1[28], &step1[29], &step1[30], &step1[31]); + &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], + &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], + &step2[12], &step2[13], &step2[14], &step2[15], &step1[16], + &step1[17], &step1[18], &step1[19], &step2[20], &step2[21], + &step2[22], &step2[23], &step2[24], &step2[25], &step2[26], + &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - step2[0] = _mm_add_epi16(step2[ 0], kOne); - step2[1] = 
_mm_add_epi16(step2[ 1], kOne); - step2[2] = _mm_add_epi16(step2[ 2], kOne); - step2[3] = _mm_add_epi16(step2[ 3], kOne); - step2[4] = _mm_add_epi16(step2[ 4], kOne); - step2[5] = _mm_add_epi16(step2[ 5], kOne); - step2[6] = _mm_add_epi16(step2[ 6], kOne); - step2[7] = _mm_add_epi16(step2[ 7], kOne); - step2[8] = _mm_add_epi16(step2[ 8], kOne); - step2[9] = _mm_add_epi16(step2[ 9], kOne); + step2[0] = _mm_add_epi16(step2[0], kOne); + step2[1] = _mm_add_epi16(step2[1], kOne); + step2[2] = _mm_add_epi16(step2[2], kOne); + step2[3] = _mm_add_epi16(step2[3], kOne); + step2[4] = _mm_add_epi16(step2[4], kOne); + step2[5] = _mm_add_epi16(step2[5], kOne); + step2[6] = _mm_add_epi16(step2[6], kOne); + step2[7] = _mm_add_epi16(step2[7], kOne); + step2[8] = _mm_add_epi16(step2[8], kOne); + step2[9] = _mm_add_epi16(step2[9], kOne); step2[10] = _mm_add_epi16(step2[10], kOne); step2[11] = _mm_add_epi16(step2[11], kOne); step2[12] = _mm_add_epi16(step2[12], kOne); @@ -595,16 +587,16 @@ void FDCT32x32_2D(const int16_t *input, step1[30] = _mm_add_epi16(step1[30], kOne); step1[31] = _mm_add_epi16(step1[31], kOne); - step2[0] = _mm_srai_epi16(step2[ 0], 2); - step2[1] = _mm_srai_epi16(step2[ 1], 2); - step2[2] = _mm_srai_epi16(step2[ 2], 2); - step2[3] = _mm_srai_epi16(step2[ 3], 2); - step2[4] = _mm_srai_epi16(step2[ 4], 2); - step2[5] = _mm_srai_epi16(step2[ 5], 2); - step2[6] = _mm_srai_epi16(step2[ 6], 2); - step2[7] = _mm_srai_epi16(step2[ 7], 2); - step2[8] = _mm_srai_epi16(step2[ 8], 2); - step2[9] = _mm_srai_epi16(step2[ 9], 2); + step2[0] = _mm_srai_epi16(step2[0], 2); + step2[1] = _mm_srai_epi16(step2[1], 2); + step2[2] = _mm_srai_epi16(step2[2], 2); + step2[3] = _mm_srai_epi16(step2[3], 2); + step2[4] = _mm_srai_epi16(step2[4], 2); + step2[5] = _mm_srai_epi16(step2[5], 2); + step2[6] = _mm_srai_epi16(step2[6], 2); + step2[7] = _mm_srai_epi16(step2[7], 2); + step2[8] = _mm_srai_epi16(step2[8], 2); + step2[9] = _mm_srai_epi16(step2[9], 2); step2[10] = _mm_srai_epi16(step2[10], 2); step2[11] = _mm_srai_epi16(step2[11], 2); step2[12] = _mm_srai_epi16(step2[12], 2); @@ -633,821 +625,884 @@ void FDCT32x32_2D(const int16_t *input, #if FDCT32x32_HIGH_PRECISION if (pass == 0) { #endif - // Stage 3 - { - step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]); - step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]); - step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]); - step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]); - step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]); - step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]); - step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]); - step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]); + // Stage 3 + { + step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]); + step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]); + step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]); + step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]); + step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]); + step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]); + step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]); + step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2], - &step3[3], &step3[4], &step3[5], - &step3[6], &step3[7]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2], + &step3[3], &step3[4], &step3[5], + &step3[6], &step3[7]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, 
output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); - const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step3[10], &step3[11], - &step3[12], &step3[13]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step3[16] = ADD_EPI16(step2[23], step1[16]); - step3[17] = ADD_EPI16(step2[22], step1[17]); - step3[18] = ADD_EPI16(step2[21], step1[18]); - step3[19] = ADD_EPI16(step2[20], step1[19]); - step3[20] = SUB_EPI16(step1[19], step2[20]); - step3[21] = SUB_EPI16(step1[18], step2[21]); - step3[22] = SUB_EPI16(step1[17], step2[22]); - step3[23] = SUB_EPI16(step1[16], step2[23]); - step3[24] = SUB_EPI16(step1[31], step2[24]); - step3[25] = SUB_EPI16(step1[30], step2[25]); - step3[26] = SUB_EPI16(step1[29], step2[26]); - step3[27] = SUB_EPI16(step1[28], step2[27]); - step3[28] = ADD_EPI16(step2[27], step1[28]); - step3[29] = ADD_EPI16(step2[26], step1[29]); - step3[30] = ADD_EPI16(step2[25], step1[30]); - step3[31] = ADD_EPI16(step2[24], step1[31]); + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = 
_mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step3[16], &step3[17], &step3[18], &step3[19], - &step3[20], &step3[21], &step3[22], &step3[23], - &step3[24], &step3[25], &step3[26], &step3[27], - &step3[28], &step3[29], &step3[30], &step3[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; + overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12], + &step3[13]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH } + { + step3[16] = ADD_EPI16(step2[23], step1[16]); + step3[17] = ADD_EPI16(step2[22], step1[17]); + step3[18] = ADD_EPI16(step2[21], step1[18]); + step3[19] = ADD_EPI16(step2[20], step1[19]); + step3[20] = SUB_EPI16(step1[19], step2[20]); + step3[21] = SUB_EPI16(step1[18], step2[21]); + step3[22] = SUB_EPI16(step1[17], step2[22]); + step3[23] = SUB_EPI16(step1[16], step2[23]); + step3[24] = SUB_EPI16(step1[31], step2[24]); + step3[25] = SUB_EPI16(step1[30], step2[25]); + step3[26] = SUB_EPI16(step1[29], step2[26]); + step3[27] = SUB_EPI16(step1[28], step2[27]); + step3[28] = ADD_EPI16(step2[27], step1[28]); + step3[29] = ADD_EPI16(step2[26], step1[29]); + step3[30] = ADD_EPI16(step2[25], step1[30]); + step3[31] = ADD_EPI16(step2[24], step1[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step3[16], 
&step3[17], &step3[18], &step3[19], &step3[20], + &step3[21], &step3[22], &step3[23], &step3[24], &step3[25], + &step3[26], &step3[27], &step3[28], &step3[29], &step3[30], + &step3[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } + } - // Stage 4 - { - step1[0] = ADD_EPI16(step3[ 3], step3[ 0]); - step1[1] = ADD_EPI16(step3[ 2], step3[ 1]); - step1[2] = SUB_EPI16(step3[ 1], step3[ 2]); - step1[3] = SUB_EPI16(step3[ 0], step3[ 3]); - step1[8] = ADD_EPI16(step3[11], step2[ 8]); - step1[9] = ADD_EPI16(step3[10], step2[ 9]); - step1[10] = SUB_EPI16(step2[ 9], step3[10]); - step1[11] = SUB_EPI16(step2[ 8], step3[11]); - step1[12] = SUB_EPI16(step2[15], step3[12]); - step1[13] = SUB_EPI16(step2[14], step3[13]); - step1[14] = ADD_EPI16(step3[13], step2[14]); - step1[15] = ADD_EPI16(step3[12], step2[15]); + // Stage 4 + { + step1[0] = ADD_EPI16(step3[3], step3[0]); + step1[1] = ADD_EPI16(step3[2], step3[1]); + step1[2] = SUB_EPI16(step3[1], step3[2]); + step1[3] = SUB_EPI16(step3[0], step3[3]); + step1[8] = ADD_EPI16(step3[11], step2[8]); + step1[9] = ADD_EPI16(step3[10], step2[9]); + step1[10] = SUB_EPI16(step2[9], step3[10]); + step1[11] = SUB_EPI16(step2[8], step3[11]); + step1[12] = SUB_EPI16(step2[15], step3[12]); + step1[13] = SUB_EPI16(step2[14], step3[13]); + step1[14] = ADD_EPI16(step3[13], step2[14]); + step1[15] = ADD_EPI16(step3[12], step2[15]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step1[0], &step1[1], &step1[2], &step1[3], - &step1[4], &step1[5], &step1[6], &step1[7], - &step1[8], &step1[9], &step1[10], &step1[11], - &step1[12], &step1[13], &step1[14], &step1[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = check_epi16_overflow_x16( + &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5], + &step1[6], &step1[7], &step1[8], &step1[9], &step1[10], + &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); - const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); - const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&step1[5], &step1[6]); - if 
(overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); - const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); - const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); - const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); - const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); - const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); - const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); - const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); - const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, 
DCT_CONST_BITS); - const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); + { + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20], - &step1[21], &step1[26], &step1[27], - &step1[28], &step1[29]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = check_epi16_overflow_x2(&step1[5], &step1[6]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - // Stage 5 - { - step2[4] = ADD_EPI16(step1[5], step3[4]); - step2[5] = SUB_EPI16(step3[4], step1[5]); - step2[6] = SUB_EPI16(step3[7], step1[6]); - step2[7] = ADD_EPI16(step1[6], step3[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2[4], &step2[5], - &step2[6], &step2[7]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); - const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); - const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); - const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); - const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); - const 
__m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[ 0] = _mm_packs_epi32(out_00_6, out_00_7); - out[16] = _mm_packs_epi32(out_16_6, out_16_7); - out[ 8] = _mm_packs_epi32(out_08_6, out_08_7); - out[24] = _mm_packs_epi32(out_24_6, out_24_7); + { + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); + const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, 
k__DCT_CONST_ROUNDING); + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&out[0], &out[16], - &out[8], &out[24]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20], + &step1[21], &step1[26], &step1[27], + &step1[28], &step1[29]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]); - const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]); - const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); - const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); - const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m128i 
s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2[9], &step2[10], - &step2[13], &step2[14]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step2[16] = ADD_EPI16(step1[19], step3[16]); - step2[17] = ADD_EPI16(step1[18], step3[17]); - step2[18] = SUB_EPI16(step3[17], step1[18]); - step2[19] = SUB_EPI16(step3[16], step1[19]); - step2[20] = SUB_EPI16(step3[23], step1[20]); - step2[21] = SUB_EPI16(step3[22], step1[21]); - step2[22] = ADD_EPI16(step1[21], step3[22]); - step2[23] = ADD_EPI16(step1[20], step3[23]); - step2[24] = ADD_EPI16(step1[27], step3[24]); - step2[25] = ADD_EPI16(step1[26], step3[25]); - step2[26] = SUB_EPI16(step3[25], step1[26]); - step2[27] = SUB_EPI16(step3[24], step1[27]); - step2[28] = SUB_EPI16(step3[31], step1[28]); - step2[29] = SUB_EPI16(step3[30], step1[29]); - step2[30] = ADD_EPI16(step1[29], step3[30]); - step2[31] = ADD_EPI16(step1[28], step3[31]); + // Stage 5 + { + step2[4] = ADD_EPI16(step1[5], step3[4]); + step2[5] = SUB_EPI16(step3[4], step1[5]); + step2[6] = SUB_EPI16(step3[7], step1[6]); + step2[7] = ADD_EPI16(step1[6], step3[7]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step2[16], &step2[17], &step2[18], &step2[19], - &step2[20], &step2[21], &step2[22], &step2[23], - &step2[24], &step2[25], &step2[26], &step2[27], - &step2[28], &step2[29], &step2[30], &step2[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = check_epi16_overflow_x4(&step2[4], 
&step2[5], &step2[6], + &step2[7]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - // Stage 6 - { - const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[4] = _mm_packs_epi32(out_04_6, out_04_7); - out[20] = _mm_packs_epi32(out_20_6, out_20_7); - out[12] = _mm_packs_epi32(out_12_6, out_12_7); - out[28] = _mm_packs_epi32(out_28_6, out_28_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&out[4], &out[20], - &out[12], &out[28]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step3[8] = ADD_EPI16(step2[ 9], step1[ 8]); - step3[9] = SUB_EPI16(step1[ 8], step2[ 9]); - step3[10] = SUB_EPI16(step1[11], step2[10]); - step3[11] = ADD_EPI16(step2[10], step1[11]); - step3[12] = ADD_EPI16(step2[13], step1[12]); - step3[13] = SUB_EPI16(step1[12], step2[13]); - step3[14] = SUB_EPI16(step1[15], step2[14]); - step3[15] = ADD_EPI16(step2[14], step1[15]); + { + const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); + 
const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i out_00_4 = + _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m128i out_00_5 = + _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m128i out_16_4 = + _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m128i out_16_5 = + _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m128i out_08_4 = + _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m128i out_08_5 = + _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m128i out_24_4 = + _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m128i out_24_5 = + _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm_packs_epi32(out_00_6, out_00_7); + out[16] = _mm_packs_epi32(out_16_6, out_16_7); + out[8] = _mm_packs_epi32(out_08_6, out_08_7); + out[24] = _mm_packs_epi32(out_24_6, out_24_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10], - &step3[11], &step3[12], &step3[13], - &step3[14], &step3[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = + check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); - const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); - const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); - const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); - const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); - const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); - const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); - const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); - const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m128i s3_21_2 = 
_mm_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); - const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); -#if DCT_HIGH_BIT_DEPTH - overflow = 
check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21], - &step3[22], &step3[25], &step3[26], - &step3[29], &step3[30]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; } + { + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]); + const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]); + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13], + &step2[14]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - // Stage 7 - { - const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]); - const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]); - const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]); - const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]); - const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); - const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); - const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); - const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); - const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); - 
const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); - const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[ 2] = _mm_packs_epi32(out_02_6, out_02_7); - out[18] = _mm_packs_epi32(out_18_6, out_18_7); - out[10] = _mm_packs_epi32(out_10_6, out_10_7); - out[26] = _mm_packs_epi32(out_26_6, out_26_7); - out[ 6] = _mm_packs_epi32(out_06_6, out_06_7); - out[22] = _mm_packs_epi32(out_22_6, 
out_22_7); - out[14] = _mm_packs_epi32(out_14_6, out_14_7); - out[30] = _mm_packs_epi32(out_30_6, out_30_7); + } + { + step2[16] = ADD_EPI16(step1[19], step3[16]); + step2[17] = ADD_EPI16(step1[18], step3[17]); + step2[18] = SUB_EPI16(step3[17], step1[18]); + step2[19] = SUB_EPI16(step3[16], step1[19]); + step2[20] = SUB_EPI16(step3[23], step1[20]); + step2[21] = SUB_EPI16(step3[22], step1[21]); + step2[22] = ADD_EPI16(step1[21], step3[22]); + step2[23] = ADD_EPI16(step1[20], step3[23]); + step2[24] = ADD_EPI16(step1[27], step3[24]); + step2[25] = ADD_EPI16(step1[26], step3[25]); + step2[26] = SUB_EPI16(step3[25], step1[26]); + step2[27] = SUB_EPI16(step3[24], step1[27]); + step2[28] = SUB_EPI16(step3[31], step1[28]); + step2[29] = SUB_EPI16(step3[30], step1[29]); + step2[30] = ADD_EPI16(step1[29], step3[30]); + step2[31] = ADD_EPI16(step1[28], step3[31]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10], - &out[26], &out[6], &out[22], - &out[14], &out[30]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; + overflow = check_epi16_overflow_x16( + &step2[16], &step2[17], &step2[18], &step2[19], &step2[20], + &step2[21], &step2[22], &step2[23], &step2[24], &step2[25], + &step2[26], &step2[27], &step2[28], &step2[29], &step2[30], + &step2[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH } + // Stage 6 + { + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m128i out_04_4 = + _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m128i out_04_5 = + _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m128i out_20_4 = + _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m128i out_20_5 = + _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m128i out_12_4 = + _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m128i out_12_5 = + _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m128i out_28_4 = + _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m128i out_28_5 = + _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, 
DCT_CONST_BITS); + const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = _mm_packs_epi32(out_04_6, out_04_7); + out[20] = _mm_packs_epi32(out_20_6, out_20_7); + out[12] = _mm_packs_epi32(out_12_6, out_12_7); + out[28] = _mm_packs_epi32(out_28_6, out_28_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - step1[16] = ADD_EPI16(step3[17], step2[16]); - step1[17] = SUB_EPI16(step2[16], step3[17]); - step1[18] = SUB_EPI16(step2[19], step3[18]); - step1[19] = ADD_EPI16(step3[18], step2[19]); - step1[20] = ADD_EPI16(step3[21], step2[20]); - step1[21] = SUB_EPI16(step2[20], step3[21]); - step1[22] = SUB_EPI16(step2[23], step3[22]); - step1[23] = ADD_EPI16(step3[22], step2[23]); - step1[24] = ADD_EPI16(step3[25], step2[24]); - step1[25] = SUB_EPI16(step2[24], step3[25]); - step1[26] = SUB_EPI16(step2[27], step3[26]); - step1[27] = ADD_EPI16(step3[26], step2[27]); - step1[28] = ADD_EPI16(step3[29], step2[28]); - step1[29] = SUB_EPI16(step2[28], step3[29]); - step1[30] = SUB_EPI16(step2[31], step3[30]); - step1[31] = ADD_EPI16(step3[30], step2[31]); + } + { + step3[8] = ADD_EPI16(step2[9], step1[8]); + step3[9] = SUB_EPI16(step1[8], step2[9]); + step3[10] = SUB_EPI16(step1[11], step2[10]); + step3[11] = ADD_EPI16(step2[10], step1[11]); + step3[12] = ADD_EPI16(step2[13], step1[12]); + step3[13] = SUB_EPI16(step1[12], step2[13]); + step3[14] = SUB_EPI16(step1[15], step2[14]); + step3[15] = ADD_EPI16(step2[14], step1[15]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step1[16], &step1[17], &step1[18], &step1[19], - &step1[20], &step1[21], &step1[22], &step1[23], - &step1[24], &step1[25], &step1[26], &step1[27], - &step1[28], &step1[29], &step1[30], &step1[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; + overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10], + &step3[11], &step3[12], &step3[13], + &step3[14], &step3[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH } + { + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, 
k__cospi_m28_m04); + const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); +#if 
DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21], + &step3[22], &step3[25], &step3[26], + &step3[29], &step3[30]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - // Final stage --- outputs indices are bit-reversed. - { - const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); - const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); - const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); - const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); - const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); - const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); - const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); - const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); - const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); - const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m128i 
out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); - const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[ 1] = _mm_packs_epi32(out_01_6, out_01_7); - out[17] = _mm_packs_epi32(out_17_6, out_17_7); - out[ 9] = _mm_packs_epi32(out_09_6, out_09_7); - out[25] = _mm_packs_epi32(out_25_6, out_25_7); - out[ 7] = _mm_packs_epi32(out_07_6, out_07_7); - out[23] = _mm_packs_epi32(out_23_6, out_23_7); - out[15] = _mm_packs_epi32(out_15_6, out_15_7); - out[31] = _mm_packs_epi32(out_31_6, out_31_7); + } + // Stage 7 + { + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); + const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]); + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m128i out_02_4 = + _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m128i out_02_5 = + _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m128i out_18_4 = + _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m128i out_18_5 = + _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m128i out_10_4 = + _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m128i out_10_5 = + _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m128i out_26_4 = + _mm_add_epi32(out_26_2, 
k__DCT_CONST_ROUNDING); + const __m128i out_26_5 = + _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m128i out_06_4 = + _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m128i out_06_5 = + _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m128i out_22_4 = + _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m128i out_22_5 = + _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m128i out_14_4 = + _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m128i out_14_5 = + _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m128i out_30_4 = + _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m128i out_30_5 = + _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = _mm_packs_epi32(out_02_6, out_02_7); + out[18] = _mm_packs_epi32(out_18_6, out_18_7); + out[10] = _mm_packs_epi32(out_10_6, out_10_7); + out[26] = _mm_packs_epi32(out_26_6, out_26_7); + out[6] = _mm_packs_epi32(out_06_6, out_06_7); + out[22] = _mm_packs_epi32(out_22_6, out_22_7); + out[14] = _mm_packs_epi32(out_14_6, out_14_7); + out[30] = _mm_packs_epi32(out_30_6, out_30_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9], - &out[25], &out[7], &out[23], - &out[15], &out[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; + overflow = + check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], + &out[6], &out[22], &out[14], &out[30]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH } + { + step1[16] = ADD_EPI16(step3[17], step2[16]); + step1[17] = SUB_EPI16(step2[16], step3[17]); + step1[18] = SUB_EPI16(step2[19], step3[18]); + step1[19] = ADD_EPI16(step3[18], step2[19]); + step1[20] = ADD_EPI16(step3[21], step2[20]); + step1[21] = SUB_EPI16(step2[20], step3[21]); + step1[22] = SUB_EPI16(step2[23], step3[22]); + step1[23] = ADD_EPI16(step3[22], step2[23]); + step1[24] = ADD_EPI16(step3[25], step2[24]); + step1[25] = SUB_EPI16(step2[24], step3[25]); + step1[26] = SUB_EPI16(step2[27], step3[26]); + step1[27] = ADD_EPI16(step3[26], step2[27]); + step1[28] = ADD_EPI16(step3[29], step2[28]); + step1[29] = SUB_EPI16(step2[28], step3[29]); + step1[30] = SUB_EPI16(step2[31], step3[30]); + step1[31] 
= ADD_EPI16(step3[30], step2[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step1[16], &step1[17], &step1[18], &step1[19], &step1[20], + &step1[21], &step1[22], &step1[23], &step1[24], &step1[25], + &step1[26], &step1[27], &step1[28], &step1[29], &step1[30], + &step1[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); - const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); - const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); - const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); - const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); - const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); - const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); - const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); - const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); - const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); - const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); - const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); - 
const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[ 5] = _mm_packs_epi32(out_05_6, out_05_7); - out[21] = _mm_packs_epi32(out_21_6, out_21_7); - out[13] = _mm_packs_epi32(out_13_6, out_13_7); - out[29] = _mm_packs_epi32(out_29_6, out_29_7); - out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); - out[19] = _mm_packs_epi32(out_19_6, out_19_7); - out[11] = _mm_packs_epi32(out_11_6, out_11_7); - out[27] = _mm_packs_epi32(out_27_6, out_27_7); + } + // Final stage --- outputs indices are bit-reversed. + { + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m128i out_01_4 = + _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m128i out_01_5 = + _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m128i out_17_4 = + _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m128i out_17_5 = + _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m128i out_09_4 = + _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m128i 
out_09_5 = + _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m128i out_25_4 = + _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m128i out_25_5 = + _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m128i out_07_4 = + _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m128i out_07_5 = + _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m128i out_23_4 = + _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m128i out_23_5 = + _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m128i out_15_4 = + _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m128i out_15_5 = + _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m128i out_31_4 = + _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m128i out_31_5 = + _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm_packs_epi32(out_01_6, out_01_7); + out[17] = _mm_packs_epi32(out_17_6, out_17_7); + out[9] = _mm_packs_epi32(out_09_6, out_09_7); + out[25] = _mm_packs_epi32(out_25_6, out_25_7); + out[7] = _mm_packs_epi32(out_07_6, out_07_7); + out[23] = _mm_packs_epi32(out_23_6, out_23_7); + out[15] = _mm_packs_epi32(out_15_6, out_15_7); + out[31] = _mm_packs_epi32(out_31_6, out_31_7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13], - &out[29], &out[3], &out[19], - &out[11], &out[27]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; + overflow = + check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], + &out[7], &out[23], &out[15], &out[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH } + { + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, 
k__cospi_p27_p05); + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m128i out_05_4 = + _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m128i out_05_5 = + _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m128i out_21_4 = + _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m128i out_21_5 = + _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m128i out_13_4 = + _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m128i out_13_5 = + _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m128i out_29_4 = + _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m128i out_29_5 = + _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m128i out_03_4 = + _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m128i out_03_5 = + _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m128i out_19_4 = + _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m128i out_19_5 = + _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m128i out_11_4 = + _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m128i out_11_5 = + _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m128i out_27_4 = + _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m128i out_27_5 = + _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm_packs_epi32(out_05_6, out_05_7); + out[21] = _mm_packs_epi32(out_21_6, 
out_21_7); + out[13] = _mm_packs_epi32(out_13_6, out_13_7); + out[29] = _mm_packs_epi32(out_29_6, out_29_7); + out[3] = _mm_packs_epi32(out_03_6, out_03_7); + out[19] = _mm_packs_epi32(out_19_6, out_19_7); + out[11] = _mm_packs_epi32(out_11_6, out_11_7); + out[27] = _mm_packs_epi32(out_27_6, out_27_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], + &out[3], &out[19], &out[11], &out[27]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - } + } #if FDCT32x32_HIGH_PRECISION } else { __m128i lstep1[64], lstep2[64], lstep3[64]; @@ -1457,32 +1512,32 @@ void FDCT32x32_2D(const int16_t *input, // stage 3 { // expanding to 32-bit length priori to addition operations - lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero); - lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero); - lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero); - lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero); - lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero); - lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero); - lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero); - lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero); - lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero); - lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero); - lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero); - lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero); - lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero); - lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero); - lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero); - lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero); - lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne); - lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne); - lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne); - lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne); - lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne); - lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne); - lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne); - lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne); - lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne); - lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne); + lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero); + lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero); + lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero); + lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero); + lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero); + lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero); + lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero); + lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero); + lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero); + lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero); + lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero); + lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero); + lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero); + lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero); + lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero); + lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero); + lstep2[0] = _mm_madd_epi16(lstep2[0], kOne); + lstep2[1] = _mm_madd_epi16(lstep2[1], kOne); + lstep2[2] = _mm_madd_epi16(lstep2[2], kOne); + lstep2[3] = _mm_madd_epi16(lstep2[3], kOne); + lstep2[4] = _mm_madd_epi16(lstep2[4], kOne); + lstep2[5] = _mm_madd_epi16(lstep2[5], kOne); + lstep2[6] = _mm_madd_epi16(lstep2[6], kOne); + lstep2[7] = _mm_madd_epi16(lstep2[7], kOne); + lstep2[8] = _mm_madd_epi16(lstep2[8], kOne); + lstep2[9] = _mm_madd_epi16(lstep2[9], kOne); lstep2[10] = _mm_madd_epi16(lstep2[10], kOne); lstep2[11] = 
_mm_madd_epi16(lstep2[11], kOne); lstep2[12] = _mm_madd_epi16(lstep2[12], kOne); @@ -1490,22 +1545,22 @@ void FDCT32x32_2D(const int16_t *input, lstep2[14] = _mm_madd_epi16(lstep2[14], kOne); lstep2[15] = _mm_madd_epi16(lstep2[15], kOne); - lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]); - lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]); - lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]); - lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]); - lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]); - lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]); - lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]); - lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]); - lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]); - lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]); - lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]); - lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]); - lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]); - lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]); - lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]); - lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]); + lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]); + lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]); + lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]); + lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]); + lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]); + lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]); + lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]); + lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]); + lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]); + lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]); + lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]); + lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]); + lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]); + lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]); + lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]); + lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]); } { const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); @@ -1643,10 +1698,10 @@ void FDCT32x32_2D(const int16_t *input, // stage 4 { // expanding to 32-bit length priori to addition operations - lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero); - lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero); - lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero); - lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero); + lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero); + lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero); + lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero); + lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero); lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero); lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); @@ -1660,14 +1715,14 @@ void FDCT32x32_2D(const int16_t *input, lstep2[30] = _mm_madd_epi16(lstep2[30], kOne); lstep2[31] = _mm_madd_epi16(lstep2[31], kOne); - lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]); - lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]); - lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]); - lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]); - lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]); - lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]); - lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]); - lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]); + lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]); + lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]); + lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]); + lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]); + lstep1[4] = _mm_sub_epi32(lstep3[2], 
lstep3[4]); + lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]); + lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]); + lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]); lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); @@ -1686,64 +1741,64 @@ void FDCT32x32_2D(const int16_t *input, lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); } { - // to be continued... - // - const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); - const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); - - u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); - u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); - u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); - u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); - - // TODO(jingning): manually inline k_madd_epi32_ to further hide - // instruction latency. - v[0] = k_madd_epi32(u[0], k32_p16_m16); - v[1] = k_madd_epi32(u[1], k32_p16_m16); - v[2] = k_madd_epi32(u[2], k32_p16_m16); - v[3] = k_madd_epi32(u[3], k32_p16_m16); - v[4] = k_madd_epi32(u[0], k32_p16_p16); - v[5] = k_madd_epi32(u[1], k32_p16_p16); - v[6] = k_madd_epi32(u[2], k32_p16_p16); - v[7] = k_madd_epi32(u[3], k32_p16_p16); + // to be continued... + // + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. + v[0] = k_madd_epi32(u[0], k32_p16_m16); + v[1] = k_madd_epi32(u[1], k32_p16_m16); + v[2] = k_madd_epi32(u[2], k32_p16_m16); + v[3] = k_madd_epi32(u[3], k32_p16_m16); + v[4] = k_madd_epi32(u[0], k32_p16_p16); + v[5] = k_madd_epi32(u[1], k32_p16_p16); + v[6] = k_madd_epi32(u[2], k32_p16_p16); + v[7] = k_madd_epi32(u[3], k32_p16_p16); #if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], - &v[4], &v[5], &v[6], &v[7], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } + overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4], + &v[5], &v[6], &v[7], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } #endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - - lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + 
lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); } { const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); - u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); - u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); - u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); - u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); - u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); - u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); - u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); - u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); - u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); - u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); + u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); + u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); + u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); + u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); + u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); + u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); + u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); + u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); + u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); + u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); @@ -1751,16 +1806,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); - v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24); - v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24); - v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24); - v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24); - v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24); - v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24); - v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24); - v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24); - v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08); - v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08); + v[0] = k_madd_epi32(u[0], k32_m08_p24); + v[1] = k_madd_epi32(u[1], k32_m08_p24); + v[2] = k_madd_epi32(u[2], k32_m08_p24); + v[3] = k_madd_epi32(u[3], k32_m08_p24); + v[4] = k_madd_epi32(u[4], k32_m08_p24); + v[5] = k_madd_epi32(u[5], k32_m08_p24); + v[6] = k_madd_epi32(u[6], k32_m08_p24); + v[7] = k_madd_epi32(u[7], k32_m08_p24); + v[8] = k_madd_epi32(u[8], k32_m24_m08); + v[9] = k_madd_epi32(u[9], k32_m24_m08); v[10] = k_madd_epi32(u[10], k32_m24_m08); v[11] = k_madd_epi32(u[11], k32_m24_m08); v[12] = k_madd_epi32(u[12], k32_m24_m08); @@ -1771,41 +1826,40 @@ void FDCT32x32_2D(const int16_t *input, v[17] = k_madd_epi32(u[13], k32_m08_p24); v[18] = k_madd_epi32(u[14], k32_m08_p24); v[19] = k_madd_epi32(u[15], k32_m08_p24); - v[20] = k_madd_epi32(u[ 8], k32_m08_p24); - v[21] = k_madd_epi32(u[ 9], k32_m08_p24); + v[20] = k_madd_epi32(u[8], k32_m08_p24); + v[21] = k_madd_epi32(u[9], k32_m08_p24); v[22] = k_madd_epi32(u[10], k32_m08_p24); v[23] = k_madd_epi32(u[11], k32_m08_p24); - v[24] = k_madd_epi32(u[ 4], k32_p24_p08); - v[25] = k_madd_epi32(u[ 5], k32_p24_p08); - v[26] = k_madd_epi32(u[ 6], k32_p24_p08); - v[27] = k_madd_epi32(u[ 7], k32_p24_p08); - v[28] = k_madd_epi32(u[ 0], k32_p24_p08); - v[29] = k_madd_epi32(u[ 1], k32_p24_p08); - v[30] = k_madd_epi32(u[ 2], k32_p24_p08); - v[31] = k_madd_epi32(u[ 3], k32_p24_p08); + v[24] = k_madd_epi32(u[4], k32_p24_p08); + v[25] = 
k_madd_epi32(u[5], k32_p24_p08); + v[26] = k_madd_epi32(u[6], k32_p24_p08); + v[27] = k_madd_epi32(u[7], k32_p24_p08); + v[28] = k_madd_epi32(u[0], k32_p24_p08); + v[29] = k_madd_epi32(u[1], k32_p24_p08); + v[30] = k_madd_epi32(u[2], k32_p24_p08); + v[31] = k_madd_epi32(u[3], k32_p24_p08); #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], - &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - u[ 0] = k_packs_epi64(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64(v[10], v[11]); - u[ 6] = k_packs_epi64(v[12], v[13]); - u[ 7] = k_packs_epi64(v[14], v[15]); - u[ 8] = k_packs_epi64(v[16], v[17]); - u[ 9] = k_packs_epi64(v[18], v[19]); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); u[10] = k_packs_epi64(v[20], v[21]); u[11] = k_packs_epi64(v[22], v[23]); u[12] = k_packs_epi64(v[24], v[25]); @@ -1813,16 +1867,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = k_packs_epi64(v[28], v[29]); u[15] = k_packs_epi64(v[30], v[31]); - v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -1830,16 +1884,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); - lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); - lstep1[38] = _mm_srai_epi32(v[ 2], 
DCT_CONST_BITS); - lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); - lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); - lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); - lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); - lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); - lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); - lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS); lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); @@ -1849,10 +1903,10 @@ void FDCT32x32_2D(const int16_t *input, } // stage 5 { - lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]); - lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]); - lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]); - lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]); + lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]); + lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]); + lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]); + lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]); lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]); lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]); lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]); @@ -1875,16 +1929,16 @@ void FDCT32x32_2D(const int16_t *input, // TODO(jingning): manually inline k_madd_epi32_ to further hide // instruction latency. 
- v[ 0] = k_madd_epi32(u[0], k32_p16_p16); - v[ 1] = k_madd_epi32(u[1], k32_p16_p16); - v[ 2] = k_madd_epi32(u[2], k32_p16_p16); - v[ 3] = k_madd_epi32(u[3], k32_p16_p16); - v[ 4] = k_madd_epi32(u[0], k32_p16_m16); - v[ 5] = k_madd_epi32(u[1], k32_p16_m16); - v[ 6] = k_madd_epi32(u[2], k32_p16_m16); - v[ 7] = k_madd_epi32(u[3], k32_p16_m16); - v[ 8] = k_madd_epi32(u[4], k32_p24_p08); - v[ 9] = k_madd_epi32(u[5], k32_p24_p08); + v[0] = k_madd_epi32(u[0], k32_p16_p16); + v[1] = k_madd_epi32(u[1], k32_p16_p16); + v[2] = k_madd_epi32(u[2], k32_p16_p16); + v[3] = k_madd_epi32(u[3], k32_p16_p16); + v[4] = k_madd_epi32(u[0], k32_p16_m16); + v[5] = k_madd_epi32(u[1], k32_p16_m16); + v[6] = k_madd_epi32(u[2], k32_p16_m16); + v[7] = k_madd_epi32(u[3], k32_p16_m16); + v[8] = k_madd_epi32(u[4], k32_p24_p08); + v[9] = k_madd_epi32(u[5], k32_p24_p08); v[10] = k_madd_epi32(u[6], k32_p24_p08); v[11] = k_madd_epi32(u[7], k32_p24_p08); v[12] = k_madd_epi32(u[4], k32_m08_p24); @@ -1894,9 +1948,8 @@ void FDCT32x32_2D(const int16_t *input, #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -1966,13 +2019,13 @@ void FDCT32x32_2D(const int16_t *input, u[7] = _mm_srai_epi32(u[7], 2); // Combine - out[ 0] = _mm_packs_epi32(u[0], u[1]); + out[0] = _mm_packs_epi32(u[0], u[1]); out[16] = _mm_packs_epi32(u[2], u[3]); - out[ 8] = _mm_packs_epi32(u[4], u[5]); + out[8] = _mm_packs_epi32(u[4], u[5]); out[24] = _mm_packs_epi32(u[6], u[7]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&out[0], &out[16], - &out[8], &out[24]); + overflow = + check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2001,8 +2054,8 @@ void FDCT32x32_2D(const int16_t *input, v[5] = k_madd_epi32(u[5], k32_m24_m08); v[6] = k_madd_epi32(u[6], k32_m24_m08); v[7] = k_madd_epi32(u[7], k32_m24_m08); - v[ 8] = k_madd_epi32(u[4], k32_m08_p24); - v[ 9] = k_madd_epi32(u[5], k32_m08_p24); + v[8] = k_madd_epi32(u[4], k32_m08_p24); + v[9] = k_madd_epi32(u[5], k32_m08_p24); v[10] = k_madd_epi32(u[6], k32_m08_p24); v[11] = k_madd_epi32(u[7], k32_m08_p24); v[12] = k_madd_epi32(u[0], k32_p24_p08); @@ -2012,9 +2065,8 @@ void FDCT32x32_2D(const int16_t *input, #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2088,10 +2140,10 @@ void FDCT32x32_2D(const int16_t *input, const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); - u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); - u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); - u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); - u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]); + u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); + u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); + u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); + 
u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); @@ -2100,10 +2152,10 @@ void FDCT32x32_2D(const int16_t *input, u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); - u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); - u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); - u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); - u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]); + u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); + u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); + u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); + u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); v[0] = k_madd_epi32(u[0], k32_p28_p04); v[1] = k_madd_epi32(u[1], k32_p28_p04); @@ -2113,8 +2165,8 @@ void FDCT32x32_2D(const int16_t *input, v[5] = k_madd_epi32(u[5], k32_p12_p20); v[6] = k_madd_epi32(u[6], k32_p12_p20); v[7] = k_madd_epi32(u[7], k32_p12_p20); - v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); - v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); + v[8] = k_madd_epi32(u[8], k32_m20_p12); + v[9] = k_madd_epi32(u[9], k32_m20_p12); v[10] = k_madd_epi32(u[10], k32_m20_p12); v[11] = k_madd_epi32(u[11], k32_m20_p12); v[12] = k_madd_epi32(u[12], k32_m04_p28); @@ -2124,9 +2176,8 @@ void FDCT32x32_2D(const int16_t *input, #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2195,13 +2246,13 @@ void FDCT32x32_2D(const int16_t *input, u[6] = _mm_srai_epi32(u[6], 2); u[7] = _mm_srai_epi32(u[7], 2); - out[ 4] = _mm_packs_epi32(u[0], u[1]); + out[4] = _mm_packs_epi32(u[0], u[1]); out[20] = _mm_packs_epi32(u[2], u[3]); out[12] = _mm_packs_epi32(u[4], u[5]); out[28] = _mm_packs_epi32(u[6], u[7]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&out[4], &out[20], - &out[12], &out[28]); + overflow = + check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2230,21 +2281,21 @@ void FDCT32x32_2D(const int16_t *input, const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); - const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64, - -cospi_20_64); + const __m128i k32_m12_m20 = + pair_set_epi32(-cospi_12_64, -cospi_20_64); const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); - u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); - u[ 1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); - u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); - u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); - u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); - u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); - u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); - u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); - u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); - u[ 9] = _mm_unpackhi_epi32(lstep2[42], 
lstep2[52]); + u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); + u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); + u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); + u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); + u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); + u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); + u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); + u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); + u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); + u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); @@ -2252,16 +2303,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); - v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28); - v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28); - v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28); - v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28); - v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04); - v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04); - v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04); - v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04); - v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); - v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); + v[0] = k_madd_epi32(u[0], k32_m04_p28); + v[1] = k_madd_epi32(u[1], k32_m04_p28); + v[2] = k_madd_epi32(u[2], k32_m04_p28); + v[3] = k_madd_epi32(u[3], k32_m04_p28); + v[4] = k_madd_epi32(u[4], k32_m28_m04); + v[5] = k_madd_epi32(u[5], k32_m28_m04); + v[6] = k_madd_epi32(u[6], k32_m28_m04); + v[7] = k_madd_epi32(u[7], k32_m28_m04); + v[8] = k_madd_epi32(u[8], k32_m20_p12); + v[9] = k_madd_epi32(u[9], k32_m20_p12); v[10] = k_madd_epi32(u[10], k32_m20_p12); v[11] = k_madd_epi32(u[11], k32_m20_p12); v[12] = k_madd_epi32(u[12], k32_m12_m20); @@ -2272,41 +2323,40 @@ void FDCT32x32_2D(const int16_t *input, v[17] = k_madd_epi32(u[13], k32_m20_p12); v[18] = k_madd_epi32(u[14], k32_m20_p12); v[19] = k_madd_epi32(u[15], k32_m20_p12); - v[20] = k_madd_epi32(u[ 8], k32_p12_p20); - v[21] = k_madd_epi32(u[ 9], k32_p12_p20); + v[20] = k_madd_epi32(u[8], k32_p12_p20); + v[21] = k_madd_epi32(u[9], k32_p12_p20); v[22] = k_madd_epi32(u[10], k32_p12_p20); v[23] = k_madd_epi32(u[11], k32_p12_p20); - v[24] = k_madd_epi32(u[ 4], k32_m04_p28); - v[25] = k_madd_epi32(u[ 5], k32_m04_p28); - v[26] = k_madd_epi32(u[ 6], k32_m04_p28); - v[27] = k_madd_epi32(u[ 7], k32_m04_p28); - v[28] = k_madd_epi32(u[ 0], k32_p28_p04); - v[29] = k_madd_epi32(u[ 1], k32_p28_p04); - v[30] = k_madd_epi32(u[ 2], k32_p28_p04); - v[31] = k_madd_epi32(u[ 3], k32_p28_p04); + v[24] = k_madd_epi32(u[4], k32_m04_p28); + v[25] = k_madd_epi32(u[5], k32_m04_p28); + v[26] = k_madd_epi32(u[6], k32_m04_p28); + v[27] = k_madd_epi32(u[7], k32_m04_p28); + v[28] = k_madd_epi32(u[0], k32_p28_p04); + v[29] = k_madd_epi32(u[1], k32_p28_p04); + v[30] = k_madd_epi32(u[2], k32_p28_p04); + v[31] = k_madd_epi32(u[3], k32_p28_p04); #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], - &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], 
&v[29], &v[30], &v[31], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - u[ 0] = k_packs_epi64(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64(v[10], v[11]); - u[ 6] = k_packs_epi64(v[12], v[13]); - u[ 7] = k_packs_epi64(v[14], v[15]); - u[ 8] = k_packs_epi64(v[16], v[17]); - u[ 9] = k_packs_epi64(v[18], v[19]); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); u[10] = k_packs_epi64(v[20], v[21]); u[11] = k_packs_epi64(v[22], v[23]); u[12] = k_packs_epi64(v[24], v[25]); @@ -2314,16 +2364,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = k_packs_epi64(v[28], v[29]); u[15] = k_packs_epi64(v[30], v[31]); - v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2331,16 +2381,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); - lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); - lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); - lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); - lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); - lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); - lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); - lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); - lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); - lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + lstep3[50] = 
_mm_srai_epi32(v[8], DCT_CONST_BITS); + lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS); lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); @@ -2353,22 +2403,22 @@ void FDCT32x32_2D(const int16_t *input, const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); - const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); + const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64); const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); - u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); - u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); - u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); - u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); - u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); - u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); - u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); - u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); - u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); - u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); + u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); + u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); + u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); + u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); + u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); + u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); + u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); + u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); + u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); + u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); @@ -2376,16 +2426,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); - v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02); - v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02); - v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02); - v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02); - v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18); - v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18); - v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18); - v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18); - v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10); - v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10); + v[0] = k_madd_epi32(u[0], k32_p30_p02); + v[1] = k_madd_epi32(u[1], k32_p30_p02); + v[2] = k_madd_epi32(u[2], k32_p30_p02); + v[3] = k_madd_epi32(u[3], k32_p30_p02); + v[4] = k_madd_epi32(u[4], k32_p14_p18); + v[5] = k_madd_epi32(u[5], k32_p14_p18); + v[6] = k_madd_epi32(u[6], k32_p14_p18); + v[7] = k_madd_epi32(u[7], k32_p14_p18); + v[8] = k_madd_epi32(u[8], k32_p22_p10); + v[9] = k_madd_epi32(u[9], k32_p22_p10); v[10] = k_madd_epi32(u[10], k32_p22_p10); v[11] = k_madd_epi32(u[11], k32_p22_p10); v[12] = k_madd_epi32(u[12], k32_p06_p26); @@ -2396,41 +2446,40 @@ void FDCT32x32_2D(const int16_t *input, v[17] = k_madd_epi32(u[13], k32_m26_p06); v[18] = k_madd_epi32(u[14], k32_m26_p06); v[19] = k_madd_epi32(u[15], k32_m26_p06); - v[20] = k_madd_epi32(u[ 8], 
k32_m10_p22); - v[21] = k_madd_epi32(u[ 9], k32_m10_p22); + v[20] = k_madd_epi32(u[8], k32_m10_p22); + v[21] = k_madd_epi32(u[9], k32_m10_p22); v[22] = k_madd_epi32(u[10], k32_m10_p22); v[23] = k_madd_epi32(u[11], k32_m10_p22); - v[24] = k_madd_epi32(u[ 4], k32_m18_p14); - v[25] = k_madd_epi32(u[ 5], k32_m18_p14); - v[26] = k_madd_epi32(u[ 6], k32_m18_p14); - v[27] = k_madd_epi32(u[ 7], k32_m18_p14); - v[28] = k_madd_epi32(u[ 0], k32_m02_p30); - v[29] = k_madd_epi32(u[ 1], k32_m02_p30); - v[30] = k_madd_epi32(u[ 2], k32_m02_p30); - v[31] = k_madd_epi32(u[ 3], k32_m02_p30); + v[24] = k_madd_epi32(u[4], k32_m18_p14); + v[25] = k_madd_epi32(u[5], k32_m18_p14); + v[26] = k_madd_epi32(u[6], k32_m18_p14); + v[27] = k_madd_epi32(u[7], k32_m18_p14); + v[28] = k_madd_epi32(u[0], k32_m02_p30); + v[29] = k_madd_epi32(u[1], k32_m02_p30); + v[30] = k_madd_epi32(u[2], k32_m02_p30); + v[31] = k_madd_epi32(u[3], k32_m02_p30); #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], - &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - u[ 0] = k_packs_epi64(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64(v[10], v[11]); - u[ 6] = k_packs_epi64(v[12], v[13]); - u[ 7] = k_packs_epi64(v[14], v[15]); - u[ 8] = k_packs_epi64(v[16], v[17]); - u[ 9] = k_packs_epi64(v[18], v[19]); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); u[10] = k_packs_epi64(v[20], v[21]); u[11] = k_packs_epi64(v[22], v[23]); u[12] = k_packs_epi64(v[24], v[25]); @@ -2438,16 +2487,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = k_packs_epi64(v[28], v[29]); u[15] = k_packs_epi64(v[30], v[31]); - v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + 
v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2455,16 +2504,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); @@ -2472,16 +2521,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); - v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); - v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); - v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); - v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); - v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); - v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); - v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); - v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); - v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); v[10] = _mm_cmplt_epi32(u[10], kZero); v[11] = _mm_cmplt_epi32(u[11], kZero); v[12] = _mm_cmplt_epi32(u[12], kZero); @@ -2489,16 +2538,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_cmplt_epi32(u[14], kZero); v[15] = _mm_cmplt_epi32(u[15], kZero); - u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); 
u[10] = _mm_sub_epi32(u[10], v[10]); u[11] = _mm_sub_epi32(u[11], v[11]); u[12] = _mm_sub_epi32(u[12], v[12]); @@ -2506,16 +2555,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_sub_epi32(u[14], v[14]); u[15] = _mm_sub_epi32(u[15], v[15]); - v[ 0] = _mm_add_epi32(u[ 0], K32One); - v[ 1] = _mm_add_epi32(u[ 1], K32One); - v[ 2] = _mm_add_epi32(u[ 2], K32One); - v[ 3] = _mm_add_epi32(u[ 3], K32One); - v[ 4] = _mm_add_epi32(u[ 4], K32One); - v[ 5] = _mm_add_epi32(u[ 5], K32One); - v[ 6] = _mm_add_epi32(u[ 6], K32One); - v[ 7] = _mm_add_epi32(u[ 7], K32One); - v[ 8] = _mm_add_epi32(u[ 8], K32One); - v[ 9] = _mm_add_epi32(u[ 9], K32One); + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); v[10] = _mm_add_epi32(u[10], K32One); v[11] = _mm_add_epi32(u[11], K32One); v[12] = _mm_add_epi32(u[12], K32One); @@ -2523,16 +2572,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], K32One); v[15] = _mm_add_epi32(u[15], K32One); - u[ 0] = _mm_srai_epi32(v[ 0], 2); - u[ 1] = _mm_srai_epi32(v[ 1], 2); - u[ 2] = _mm_srai_epi32(v[ 2], 2); - u[ 3] = _mm_srai_epi32(v[ 3], 2); - u[ 4] = _mm_srai_epi32(v[ 4], 2); - u[ 5] = _mm_srai_epi32(v[ 5], 2); - u[ 6] = _mm_srai_epi32(v[ 6], 2); - u[ 7] = _mm_srai_epi32(v[ 7], 2); - u[ 8] = _mm_srai_epi32(v[ 8], 2); - u[ 9] = _mm_srai_epi32(v[ 9], 2); + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); u[10] = _mm_srai_epi32(v[10], 2); u[11] = _mm_srai_epi32(v[11], 2); u[12] = _mm_srai_epi32(v[12], 2); @@ -2540,18 +2589,18 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], 2); u[15] = _mm_srai_epi32(v[15], 2); - out[ 2] = _mm_packs_epi32(u[0], u[1]); + out[2] = _mm_packs_epi32(u[0], u[1]); out[18] = _mm_packs_epi32(u[2], u[3]); out[10] = _mm_packs_epi32(u[4], u[5]); out[26] = _mm_packs_epi32(u[6], u[7]); - out[ 6] = _mm_packs_epi32(u[8], u[9]); + out[6] = _mm_packs_epi32(u[8], u[9]); out[22] = _mm_packs_epi32(u[10], u[11]); out[14] = _mm_packs_epi32(u[12], u[13]); out[30] = _mm_packs_epi32(u[14], u[15]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10], - &out[26], &out[6], &out[22], - &out[14], &out[30]); + overflow = + check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], + &out[6], &out[22], &out[14], &out[30]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2603,16 +2652,16 @@ void FDCT32x32_2D(const int16_t *input, const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); - u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); - u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); - u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); - u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); - u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); - u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); - u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); - u[ 7] = 
_mm_unpackhi_epi32(lstep1[35], lstep1[61]); - u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); - u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); + u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); + u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); + u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); + u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); + u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); + u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); + u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); + u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); + u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); + u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); @@ -2620,16 +2669,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); - v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01); - v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01); - v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01); - v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01); - v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17); - v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17); - v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17); - v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17); - v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09); - v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09); + v[0] = k_madd_epi32(u[0], k32_p31_p01); + v[1] = k_madd_epi32(u[1], k32_p31_p01); + v[2] = k_madd_epi32(u[2], k32_p31_p01); + v[3] = k_madd_epi32(u[3], k32_p31_p01); + v[4] = k_madd_epi32(u[4], k32_p15_p17); + v[5] = k_madd_epi32(u[5], k32_p15_p17); + v[6] = k_madd_epi32(u[6], k32_p15_p17); + v[7] = k_madd_epi32(u[7], k32_p15_p17); + v[8] = k_madd_epi32(u[8], k32_p23_p09); + v[9] = k_madd_epi32(u[9], k32_p23_p09); v[10] = k_madd_epi32(u[10], k32_p23_p09); v[11] = k_madd_epi32(u[11], k32_p23_p09); v[12] = k_madd_epi32(u[12], k32_p07_p25); @@ -2640,41 +2689,40 @@ void FDCT32x32_2D(const int16_t *input, v[17] = k_madd_epi32(u[13], k32_m25_p07); v[18] = k_madd_epi32(u[14], k32_m25_p07); v[19] = k_madd_epi32(u[15], k32_m25_p07); - v[20] = k_madd_epi32(u[ 8], k32_m09_p23); - v[21] = k_madd_epi32(u[ 9], k32_m09_p23); + v[20] = k_madd_epi32(u[8], k32_m09_p23); + v[21] = k_madd_epi32(u[9], k32_m09_p23); v[22] = k_madd_epi32(u[10], k32_m09_p23); v[23] = k_madd_epi32(u[11], k32_m09_p23); - v[24] = k_madd_epi32(u[ 4], k32_m17_p15); - v[25] = k_madd_epi32(u[ 5], k32_m17_p15); - v[26] = k_madd_epi32(u[ 6], k32_m17_p15); - v[27] = k_madd_epi32(u[ 7], k32_m17_p15); - v[28] = k_madd_epi32(u[ 0], k32_m01_p31); - v[29] = k_madd_epi32(u[ 1], k32_m01_p31); - v[30] = k_madd_epi32(u[ 2], k32_m01_p31); - v[31] = k_madd_epi32(u[ 3], k32_m01_p31); + v[24] = k_madd_epi32(u[4], k32_m17_p15); + v[25] = k_madd_epi32(u[5], k32_m17_p15); + v[26] = k_madd_epi32(u[6], k32_m17_p15); + v[27] = k_madd_epi32(u[7], k32_m17_p15); + v[28] = k_madd_epi32(u[0], k32_m01_p31); + v[29] = k_madd_epi32(u[1], k32_m01_p31); + v[30] = k_madd_epi32(u[2], k32_m01_p31); + v[31] = k_madd_epi32(u[3], k32_m01_p31); #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], - &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], 
&v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - u[ 0] = k_packs_epi64(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64(v[10], v[11]); - u[ 6] = k_packs_epi64(v[12], v[13]); - u[ 7] = k_packs_epi64(v[14], v[15]); - u[ 8] = k_packs_epi64(v[16], v[17]); - u[ 9] = k_packs_epi64(v[18], v[19]); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); u[10] = k_packs_epi64(v[20], v[21]); u[11] = k_packs_epi64(v[22], v[23]); u[12] = k_packs_epi64(v[24], v[25]); @@ -2682,16 +2730,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = k_packs_epi64(v[28], v[29]); u[15] = k_packs_epi64(v[30], v[31]); - v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2699,16 +2747,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], 
DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); @@ -2716,16 +2764,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); - v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); - v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); - v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); - v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); - v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); - v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); - v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); - v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); - v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); v[10] = _mm_cmplt_epi32(u[10], kZero); v[11] = _mm_cmplt_epi32(u[11], kZero); v[12] = _mm_cmplt_epi32(u[12], kZero); @@ -2733,16 +2781,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_cmplt_epi32(u[14], kZero); v[15] = _mm_cmplt_epi32(u[15], kZero); - u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); u[10] = _mm_sub_epi32(u[10], v[10]); u[11] = _mm_sub_epi32(u[11], v[11]); u[12] = _mm_sub_epi32(u[12], v[12]); @@ -2784,18 +2832,18 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], 2); u[15] = _mm_srai_epi32(v[15], 2); - out[ 1] = _mm_packs_epi32(u[0], u[1]); + out[1] = _mm_packs_epi32(u[0], u[1]); out[17] = _mm_packs_epi32(u[2], u[3]); - out[ 9] = _mm_packs_epi32(u[4], u[5]); + out[9] = _mm_packs_epi32(u[4], u[5]); out[25] = _mm_packs_epi32(u[6], u[7]); - out[ 7] = _mm_packs_epi32(u[8], u[9]); + out[7] = _mm_packs_epi32(u[8], u[9]); out[23] = _mm_packs_epi32(u[10], u[11]); out[15] = _mm_packs_epi32(u[12], u[13]); out[31] = _mm_packs_epi32(u[14], u[15]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9], - &out[25], &out[7], &out[23], - &out[15], &out[31]); + overflow = + check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], + &out[7], &out[23], &out[15], &out[31]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; @@ -2812,16 +2860,16 @@ void FDCT32x32_2D(const int16_t *input, const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); - u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); - u[ 1] = _mm_unpackhi_epi32(lstep1[40], 
lstep1[54]); - u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); - u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); - u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); - u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); - u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); - u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); - u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); - u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); + u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); + u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); + u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); + u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); + u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); + u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); + u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); + u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); + u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); + u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); @@ -2829,16 +2877,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); - v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05); - v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05); - v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05); - v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05); - v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21); - v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21); - v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21); - v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21); - v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13); - v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13); + v[0] = k_madd_epi32(u[0], k32_p27_p05); + v[1] = k_madd_epi32(u[1], k32_p27_p05); + v[2] = k_madd_epi32(u[2], k32_p27_p05); + v[3] = k_madd_epi32(u[3], k32_p27_p05); + v[4] = k_madd_epi32(u[4], k32_p11_p21); + v[5] = k_madd_epi32(u[5], k32_p11_p21); + v[6] = k_madd_epi32(u[6], k32_p11_p21); + v[7] = k_madd_epi32(u[7], k32_p11_p21); + v[8] = k_madd_epi32(u[8], k32_p19_p13); + v[9] = k_madd_epi32(u[9], k32_p19_p13); v[10] = k_madd_epi32(u[10], k32_p19_p13); v[11] = k_madd_epi32(u[11], k32_p19_p13); v[12] = k_madd_epi32(u[12], k32_p03_p29); @@ -2849,41 +2897,40 @@ void FDCT32x32_2D(const int16_t *input, v[17] = k_madd_epi32(u[13], k32_m29_p03); v[18] = k_madd_epi32(u[14], k32_m29_p03); v[19] = k_madd_epi32(u[15], k32_m29_p03); - v[20] = k_madd_epi32(u[ 8], k32_m13_p19); - v[21] = k_madd_epi32(u[ 9], k32_m13_p19); + v[20] = k_madd_epi32(u[8], k32_m13_p19); + v[21] = k_madd_epi32(u[9], k32_m13_p19); v[22] = k_madd_epi32(u[10], k32_m13_p19); v[23] = k_madd_epi32(u[11], k32_m13_p19); - v[24] = k_madd_epi32(u[ 4], k32_m21_p11); - v[25] = k_madd_epi32(u[ 5], k32_m21_p11); - v[26] = k_madd_epi32(u[ 6], k32_m21_p11); - v[27] = k_madd_epi32(u[ 7], k32_m21_p11); - v[28] = k_madd_epi32(u[ 0], k32_m05_p27); - v[29] = k_madd_epi32(u[ 1], k32_m05_p27); - v[30] = k_madd_epi32(u[ 2], k32_m05_p27); - v[31] = k_madd_epi32(u[ 3], k32_m05_p27); + v[24] = k_madd_epi32(u[4], k32_m21_p11); + v[25] = k_madd_epi32(u[5], k32_m21_p11); + v[26] = k_madd_epi32(u[6], k32_m21_p11); + v[27] = k_madd_epi32(u[7], k32_m21_p11); + v[28] = k_madd_epi32(u[0], k32_m05_p27); + v[29] = k_madd_epi32(u[1], k32_m05_p27); + v[30] = k_madd_epi32(u[2], k32_m05_p27); + v[31] = k_madd_epi32(u[3], k32_m05_p27); #if DCT_HIGH_BIT_DEPTH overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], - 
&v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], - &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], - &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], - &kZero); + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; } #endif // DCT_HIGH_BIT_DEPTH - u[ 0] = k_packs_epi64(v[ 0], v[ 1]); - u[ 1] = k_packs_epi64(v[ 2], v[ 3]); - u[ 2] = k_packs_epi64(v[ 4], v[ 5]); - u[ 3] = k_packs_epi64(v[ 6], v[ 7]); - u[ 4] = k_packs_epi64(v[ 8], v[ 9]); - u[ 5] = k_packs_epi64(v[10], v[11]); - u[ 6] = k_packs_epi64(v[12], v[13]); - u[ 7] = k_packs_epi64(v[14], v[15]); - u[ 8] = k_packs_epi64(v[16], v[17]); - u[ 9] = k_packs_epi64(v[18], v[19]); + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); u[10] = k_packs_epi64(v[20], v[21]); u[11] = k_packs_epi64(v[22], v[23]); u[12] = k_packs_epi64(v[24], v[25]); @@ -2891,16 +2938,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = k_packs_epi64(v[28], v[29]); u[15] = k_packs_epi64(v[30], v[31]); - v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); - v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); - v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); - v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); - v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); - v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); - v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); - v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); - v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); - v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); @@ -2908,16 +2955,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); - u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); - u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); - u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); - u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); - u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); - u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); - u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); - u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); - u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = 
_mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); @@ -2925,16 +2972,16 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); - v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); - v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); - v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); - v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); - v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); - v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); - v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); - v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); - v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); v[10] = _mm_cmplt_epi32(u[10], kZero); v[11] = _mm_cmplt_epi32(u[11], kZero); v[12] = _mm_cmplt_epi32(u[12], kZero); @@ -2942,16 +2989,16 @@ void FDCT32x32_2D(const int16_t *input, v[14] = _mm_cmplt_epi32(u[14], kZero); v[15] = _mm_cmplt_epi32(u[15], kZero); - u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); - u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); - u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); - u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); - u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); - u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); - u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); - u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); - u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); - u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); u[10] = _mm_sub_epi32(u[10], v[10]); u[11] = _mm_sub_epi32(u[11], v[11]); u[12] = _mm_sub_epi32(u[12], v[12]); @@ -2993,18 +3040,18 @@ void FDCT32x32_2D(const int16_t *input, u[14] = _mm_srai_epi32(v[14], 2); u[15] = _mm_srai_epi32(v[15], 2); - out[ 5] = _mm_packs_epi32(u[0], u[1]); + out[5] = _mm_packs_epi32(u[0], u[1]); out[21] = _mm_packs_epi32(u[2], u[3]); out[13] = _mm_packs_epi32(u[4], u[5]); out[29] = _mm_packs_epi32(u[6], u[7]); - out[ 3] = _mm_packs_epi32(u[8], u[9]); + out[3] = _mm_packs_epi32(u[8], u[9]); out[19] = _mm_packs_epi32(u[10], u[11]); out[11] = _mm_packs_epi32(u[12], u[13]); out[27] = _mm_packs_epi32(u[14], u[15]); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13], - &out[29], &out[3], &out[19], - &out[11], &out[27]); + overflow = + check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], + &out[3], &out[19], &out[11], &out[27]); if (overflow) { HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); return; diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c 
b/vpx_dsp/x86/fwd_txfm_avx2.c index 6d9da6aa89e62009c14218dc49346bdbaf7d105e..21f11f0c3e5ff05e27ac8d48c74cf1bae6ef2c7f 100644 --- a/vpx_dsp/x86/fwd_txfm_avx2.c +++ b/vpx_dsp/x86/fwd_txfm_avx2.c @@ -13,11 +13,11 @@ #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 #include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" -#undef FDCT32x32_2D_AVX2 -#undef FDCT32x32_HIGH_PRECISION +#undef FDCT32x32_2D_AVX2 +#undef FDCT32x32_HIGH_PRECISION #define FDCT32x32_2D_AVX2 vpx_fdct32x32_avx2 #define FDCT32x32_HIGH_PRECISION 1 -#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT -#undef FDCT32x32_2D_AVX2 -#undef FDCT32x32_HIGH_PRECISION +#include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT +#undef FDCT32x32_2D_AVX2 +#undef FDCT32x32_HIGH_PRECISION diff --git a/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/vpx_dsp/x86/fwd_txfm_impl_sse2.h index 69889e2e98cdf6d10788db303f3266fbf10a3718..743e55e635c5dc5d85b199c4a6ea46e15826f695 100644 --- a/vpx_dsp/x86/fwd_txfm_impl_sse2.h +++ b/vpx_dsp/x86/fwd_txfm_impl_sse2.h @@ -43,44 +43,36 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { // These are the coefficients used for the multiplies. // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), // where cospi_N_64 = cos(N pi /64) - const __m128i k__cospi_A = octa_set_epi16(cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64, - cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64); - const __m128i k__cospi_B = octa_set_epi16(cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64, - cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64); - const __m128i k__cospi_C = octa_set_epi16(cospi_8_64, cospi_24_64, - cospi_8_64, cospi_24_64, - cospi_24_64, -cospi_8_64, - cospi_24_64, -cospi_8_64); - const __m128i k__cospi_D = octa_set_epi16(cospi_24_64, -cospi_8_64, - cospi_24_64, -cospi_8_64, - cospi_8_64, cospi_24_64, - cospi_8_64, cospi_24_64); - const __m128i k__cospi_E = octa_set_epi16(cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64); - const __m128i k__cospi_F = octa_set_epi16(cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64); - const __m128i k__cospi_G = octa_set_epi16(cospi_8_64, cospi_24_64, - cospi_8_64, cospi_24_64, - -cospi_8_64, -cospi_24_64, - -cospi_8_64, -cospi_24_64); - const __m128i k__cospi_H = octa_set_epi16(cospi_24_64, -cospi_8_64, - cospi_24_64, -cospi_8_64, - -cospi_24_64, cospi_8_64, - -cospi_24_64, cospi_8_64); + const __m128i k__cospi_A = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_B = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_C = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); + const __m128i k__cospi_D = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); + const __m128i k__cospi_E = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_F = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_G = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + 
-cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); + const __m128i k__cospi_H = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); // This second rounding constant saves doing some extra adds at the end - const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING - +(DCT_CONST_ROUNDING << 1)); - const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; + const __m128i k__DCT_CONST_ROUNDING2 = + _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); + const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); __m128i in0, in1; @@ -90,14 +82,14 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { #endif // Load inputs. - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) - (input + 2 * stride))); - in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) - (input + 3 * stride))); - // in0 = [i0 i1 i2 i3 iC iD iE iF] - // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64( + in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + in0 = _mm_unpacklo_epi64( + in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); +// in0 = [i0 i1 i2 i3 iC iD iE iF] +// in1 = [i4 i5 i6 i7 i8 i9 iA iB] #if DCT_HIGH_BIT_DEPTH // Check inputs small enough to use optimised code cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)), @@ -194,8 +186,8 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { const __m128i t0 = ADD_EPI16(in0, in1); const __m128i t1 = SUB_EPI16(in0, in1); - // t0 = [c0 c1 c8 c9 c4 c5 cC cD] - // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] +// t0 = [c0 c1 c8 c9 c4 c5 cC cD] +// t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x2(&t0, &t1); if (overflow) { @@ -263,7 +255,6 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { storeu_output(&in1, output + 2 * 4); } - void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { int pass; // Constants @@ -283,14 +274,14 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { int overflow; #endif // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i 
in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); // Pre-condition input (shift by two) in0 = _mm_slli_epi16(in0, 2); in1 = _mm_slli_epi16(in1, 2); @@ -319,8 +310,8 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { const __m128i q7 = SUB_EPI16(in0, in7); #if DCT_HIGH_BIT_DEPTH if (pass == 1) { - overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3, - &q4, &q5, &q6, &q7); + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); if (overflow) { vpx_highbd_fdct8x8_c(input, output, stride); return; @@ -630,22 +621,22 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { __m128i res08, res09, res10, res11, res12, res13, res14, res15; // Load and pre-condition input. if (0 == pass) { - in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); - in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); + in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); // x = x << 2 in00 = _mm_slli_epi16(in00, 2); in01 = _mm_slli_epi16(in01, 2); @@ -664,22 +655,22 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { in14 = _mm_slli_epi16(in14, 2); in15 = _mm_slli_epi16(in15, 2); } else { - in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); - in07 = _mm_load_si128((const 
__m128i *)(in + 7 * 16)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); + in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); // x = (x + 1) >> 2 in00 = _mm_add_epi16(in00, kOne); in01 = _mm_add_epi16(in01, kOne); @@ -745,10 +736,9 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { step1_6 = SUB_EPI16(in01, in14); step1_7 = SUB_EPI16(in00, in15); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1_0, &step1_1, - &step1_2, &step1_3, - &step1_4, &step1_5, - &step1_6, &step1_7); + overflow = + check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, + &step1_4, &step1_5, &step1_6, &step1_7); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -767,8 +757,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { const __m128i q6 = SUB_EPI16(input1, input6); const __m128i q7 = SUB_EPI16(input0, input7); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3, - &q4, &q5, &q6, &q7); + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -818,12 +808,12 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { // into 32 bits. 
const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, - DCT_CONST_BITS); - const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, - DCT_CONST_BITS); + const __m128i r0 = + mult_round_shift(&d0, &d1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m128i r1 = + mult_round_shift(&d0, &d1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); #if DCT_HIGH_BIT_DEPTH overflow = check_epi16_overflow_x2(&r0, &r1); if (overflow) { @@ -860,8 +850,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res02, &res14, - &res10, &res06); + overflow = + check_epi16_overflow_x4(&res02, &res14, &res10, &res06); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -888,8 +878,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, - &step2_4); + overflow = + check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -907,10 +897,9 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { step3_6 = ADD_EPI16(step1_6, step2_5); step3_7 = ADD_EPI16(step1_7, step2_4); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3_0, &step3_1, - &step3_2, &step3_3, - &step3_4, &step3_5, - &step3_6, &step3_7); + overflow = + check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3, + &step3_4, &step3_5, &step3_6, &step3_7); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -932,8 +921,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, - &step2_5); + overflow = + check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -951,10 +940,9 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { step1_6 = SUB_EPI16(step3_7, step2_6); step1_7 = ADD_EPI16(step3_7, step2_6); #if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1_0, &step1_1, - &step1_2, &step1_3, - &step1_4, &step1_5, - &step1_6, &step1_7); + overflow = + check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, + &step1_4, &step1_5, &step1_6, &step1_7); if (overflow) { vpx_highbd_fdct16x16_c(input, output, stride); return; @@ -1006,16 +994,14 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { } } // Transpose the results, do it as two 8x8 transposes. 
- transpose_and_output8x8(&res00, &res01, &res02, &res03, - &res04, &res05, &res06, &res07, - pass, out0, out1); - transpose_and_output8x8(&res08, &res09, &res10, &res11, - &res12, &res13, &res14, &res15, - pass, out0 + 8, out1 + 8); + transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05, + &res06, &res07, pass, out0, out1); + transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13, + &res14, &res15, pass, out0 + 8, out1 + 8); if (pass == 0) { - out0 += 8*16; + out0 += 8 * 16; } else { - out1 += 8*16; + out1 += 8 * 16; } } // Setup in/out for next pass. diff --git a/vpx_dsp/x86/fwd_txfm_sse2.c b/vpx_dsp/x86/fwd_txfm_sse2.c index 3e4f49bd95262d4903e9e3a2c3d64dbca1a456fd..e14b99197f654ae84d312fd861b63e5de4a060ff 100644 --- a/vpx_dsp/x86/fwd_txfm_sse2.c +++ b/vpx_dsp/x86/fwd_txfm_sse2.c @@ -19,12 +19,12 @@ void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { __m128i in0, in1; __m128i tmp; const __m128i zero = _mm_setzero_si128(); - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) - (input + 2 * stride))); - in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) - (input + 3 * stride))); + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64( + in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + in0 = _mm_unpacklo_epi64( + in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); tmp = _mm_add_epi16(in0, in1); in0 = _mm_unpacklo_epi16(zero, tmp); @@ -45,19 +45,19 @@ void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { } void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); __m128i u0, u1, sum; u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); sum = _mm_add_epi16(u0, u1); @@ -65,7 +65,7 @@ void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { in2 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, in0); - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); sum = _mm_add_epi16(sum, in2); in0 = _mm_unpacklo_epi16(u0, sum); @@ -92,50 +92,50 @@ void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int i; for (i = 0; i < 2; ++i) { - in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); - in1 = 
_mm_load_si128((const __m128i *)(input + 0 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8)); + in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8)); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8)); + in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8)); sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8)); + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8)); sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8)); + in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8)); sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); sum = _mm_add_epi16(sum, u1); input += 8 * stride; } - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); in0 = _mm_unpacklo_epi16(u0, sum); in1 = _mm_unpackhi_epi16(u0, sum); in0 = _mm_srai_epi32(in0, 16); @@ -161,53 +161,53 @@ void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int i; for (i = 0; i < 8; ++i) { - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; u0 = _mm_add_epi16(in0, in1); u1 = 
_mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); sum = _mm_add_epi16(sum, u1); } - u0 = _mm_setzero_si128(); + u0 = _mm_setzero_si128(); in0 = _mm_unpacklo_epi16(u0, sum); in1 = _mm_unpackhi_epi16(u0, sum); in0 = _mm_srai_epi32(in0, 16); @@ -230,43 +230,43 @@ void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, #define FDCT8x8_2D vpx_fdct8x8_sse2 #define FDCT16x16_2D vpx_fdct16x16_sse2 #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" -#undef FDCT4x4_2D -#undef FDCT8x8_2D -#undef FDCT16x16_2D +#undef FDCT4x4_2D +#undef FDCT8x8_2D +#undef FDCT16x16_2D #define FDCT32x32_2D vpx_fdct32x32_rd_sse2 #define FDCT32x32_HIGH_PRECISION 0 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION #define FDCT32x32_2D vpx_fdct32x32_sse2 #define FDCT32x32_HIGH_PRECISION 1 #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION -#undef DCT_HIGH_BIT_DEPTH +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION +#undef DCT_HIGH_BIT_DEPTH #if CONFIG_VP9_HIGHBITDEPTH #define DCT_HIGH_BIT_DEPTH 1 #define FDCT4x4_2D vpx_highbd_fdct4x4_sse2 #define FDCT8x8_2D vpx_highbd_fdct8x8_sse2 #define FDCT16x16_2D vpx_highbd_fdct16x16_sse2 -#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT -#undef FDCT4x4_2D -#undef FDCT8x8_2D -#undef FDCT16x16_2D +#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT +#undef FDCT4x4_2D +#undef FDCT8x8_2D +#undef FDCT16x16_2D #define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2 #define FDCT32x32_HIGH_PRECISION 0 -#include 
"vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION #define FDCT32x32_2D vpx_highbd_fdct32x32_sse2 #define FDCT32x32_HIGH_PRECISION 1 -#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION -#undef DCT_HIGH_BIT_DEPTH +#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION +#undef DCT_HIGH_BIT_DEPTH #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/fwd_txfm_sse2.h b/vpx_dsp/x86/fwd_txfm_sse2.h index 94d5befbfea52b47ccd42fb0fd5a05f2dd9b1270..5201e764c83af753fe84dbe2fc5ce91856f4c4e0 100644 --- a/vpx_dsp/x86/fwd_txfm_sse2.h +++ b/vpx_dsp/x86/fwd_txfm_sse2.h @@ -63,99 +63,57 @@ static INLINE int check_epi16_overflow_x4(const __m128i *preg0, return _mm_movemask_epi8(cmp0); } -static INLINE int check_epi16_overflow_x8(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7) { +static INLINE int check_epi16_overflow_x8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); return res0 + res1; } -static INLINE int check_epi16_overflow_x12(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *preg8, - const __m128i *preg9, - const __m128i *preg10, - const __m128i *preg11) { +static INLINE int check_epi16_overflow_x12( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); - if (!res0) - res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); return res0 + res1; } -static INLINE int check_epi16_overflow_x16(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *preg8, - const __m128i *preg9, - const __m128i *preg10, - const __m128i *preg11, - const __m128i *preg12, - const __m128i *preg13, - const __m128i *preg14, - const __m128i *preg15) { +static INLINE int check_epi16_overflow_x16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); if (!res0) { 
res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); - if (!res1) - res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); } return res0 + res1; } -static INLINE int check_epi16_overflow_x32(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *preg8, - const __m128i *preg9, - const __m128i *preg10, - const __m128i *preg11, - const __m128i *preg12, - const __m128i *preg13, - const __m128i *preg14, - const __m128i *preg15, - const __m128i *preg16, - const __m128i *preg17, - const __m128i *preg18, - const __m128i *preg19, - const __m128i *preg20, - const __m128i *preg21, - const __m128i *preg22, - const __m128i *preg23, - const __m128i *preg24, - const __m128i *preg25, - const __m128i *preg26, - const __m128i *preg27, - const __m128i *preg28, - const __m128i *preg29, - const __m128i *preg30, - const __m128i *preg31) { +static INLINE int check_epi16_overflow_x32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31) { int res0, res1; res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); @@ -190,36 +148,31 @@ static INLINE int k_check_epi32_overflow_4(const __m128i *preg0, __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1); __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1); __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1); - __m128i reg0_top_dwords = _mm_shuffle_epi32( - reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg1_top_dwords = _mm_shuffle_epi32( - reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg2_top_dwords = _mm_shuffle_epi32( - reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg3_top_dwords = _mm_shuffle_epi32( - reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg0_top_dwords = + _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg1_top_dwords = + _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg2_top_dwords = + _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg3_top_dwords = + _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1)); __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords); __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords); __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero); __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero); __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one); __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one); - int overflow_01 = _mm_movemask_epi8( - _mm_cmpeq_epi32(valid_positve_01, valid_negative_01)); - int overflow_23 = _mm_movemask_epi8( - _mm_cmpeq_epi32(valid_positve_23, 
valid_negative_23)); + int overflow_01 = + _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01)); + int overflow_23 = + _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23)); return (overflow_01 + overflow_23); } -static INLINE int k_check_epi32_overflow_8(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *zero) { +static INLINE int k_check_epi32_overflow_8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *zero) { int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); if (!overflow) { overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); @@ -227,91 +180,59 @@ static INLINE int k_check_epi32_overflow_8(const __m128i *preg0, return overflow; } -static INLINE int k_check_epi32_overflow_16(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *preg8, - const __m128i *preg9, - const __m128i *preg10, - const __m128i *preg11, - const __m128i *preg12, - const __m128i *preg13, - const __m128i *preg14, - const __m128i *preg15, - const __m128i *zero) { +static INLINE int k_check_epi32_overflow_16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *zero) { int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); if (!overflow) { overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, - zero); + overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, - zero); + overflow = + k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); } } } return overflow; } -static INLINE int k_check_epi32_overflow_32(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *preg4, - const __m128i *preg5, - const __m128i *preg6, - const __m128i *preg7, - const __m128i *preg8, - const __m128i *preg9, - const __m128i *preg10, - const __m128i *preg11, - const __m128i *preg12, - const __m128i *preg13, - const __m128i *preg14, - const __m128i *preg15, - const __m128i *preg16, - const __m128i *preg17, - const __m128i *preg18, - const __m128i *preg19, - const __m128i *preg20, - const __m128i *preg21, - const __m128i *preg22, - const __m128i *preg23, - const __m128i *preg24, - const __m128i *preg25, - const __m128i *preg26, - const __m128i *preg27, - const __m128i *preg28, - const __m128i *preg29, - const __m128i *preg30, - const __m128i *preg31, - const __m128i *zero) { +static INLINE int k_check_epi32_overflow_32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const 
__m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31, const __m128i *zero) { int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); if (!overflow) { overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); if (!overflow) { overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, - zero); + overflow = + k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, - zero); + overflow = + k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg20, preg21, - preg22, preg23, zero); + overflow = + k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg24, preg25, - preg26, preg27, zero); + overflow = k_check_epi32_overflow_4(preg24, preg25, preg26, + preg27, zero); if (!overflow) { - overflow = k_check_epi32_overflow_4(preg28, preg29, - preg30, preg31, zero); + overflow = k_check_epi32_overflow_4(preg28, preg29, preg30, + preg31, zero); } } } @@ -322,7 +243,7 @@ static INLINE int k_check_epi32_overflow_32(const __m128i *preg0, return overflow; } -static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) { +static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { #if CONFIG_VP9_HIGHBITDEPTH const __m128i zero = _mm_setzero_si128(); const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); @@ -335,7 +256,7 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) { #endif // CONFIG_VP9_HIGHBITDEPTH } -static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) { +static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { #if CONFIG_VP9_HIGHBITDEPTH const __m128i zero = _mm_setzero_si128(); const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); @@ -348,9 +269,7 @@ static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) { #endif // CONFIG_VP9_HIGHBITDEPTH } - -static INLINE __m128i mult_round_shift(const __m128i *pin0, - const __m128i *pin1, +static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1, const __m128i *pmultiplier, const __m128i *prounding, const int shift) { @@ -364,12 +283,10 @@ static INLINE __m128i mult_round_shift(const __m128i *pin0, } static INLINE void transpose_and_output8x8( - const __m128i *pin00, const __m128i *pin01, - const __m128i *pin02, const __m128i *pin03, - const __m128i *pin04, const __m128i *pin05, - const __m128i *pin06, const __m128i *pin07, - const int pass, int16_t* out0_ptr, - tran_low_t* out1_ptr) { + const __m128i *pin00, const __m128i *pin01, const __m128i *pin02, + const __m128i *pin03, const __m128i *pin04, const __m128i *pin05, + const __m128i *pin06, const __m128i *pin07, const int pass, + int16_t *out0_ptr, tran_low_t *out1_ptr) { // 00 01 02 03 04 05 
06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 @@ -427,14 +344,14 @@ static INLINE void transpose_and_output8x8( // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 if (pass == 0) { - _mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0); - _mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1); - _mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2); - _mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3); - _mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4); - _mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5); - _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6); - _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7); + _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0); + _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1); + _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2); + _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3); + _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4); + _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5); + _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6); + _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7); } else { storeu_output(&tr2_0, (out1_ptr + 0 * 16)); storeu_output(&tr2_1, (out1_ptr + 1 * 16)); diff --git a/vpx_dsp/x86/halfpix_variance_sse2.c b/vpx_dsp/x86/halfpix_variance_sse2.c index 4a8fb6df7a3fd263c12b50ecf32ebf27bd4dfaca..b5c3f5fa2b1ec9d6b077400136f6ef9686b0f1ed 100644 --- a/vpx_dsp/x86/halfpix_variance_sse2.c +++ b/vpx_dsp/x86/halfpix_variance_sse2.c @@ -17,10 +17,8 @@ void vpx_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride, const unsigned char *src, - int src_stride, - unsigned int height, - int *sum, - unsigned int *sumsquared); + int src_stride, unsigned int height, + int *sum, unsigned int *sumsquared); void vpx_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride, const unsigned char *src, int src_stride, unsigned int height, int *sum, @@ -33,8 +31,7 @@ void vpx_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride, uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src, int src_stride, const unsigned char *dst, - int dst_stride, - uint32_t *sse) { + int dst_stride, uint32_t *sse) { int xsum0; unsigned int xxsum0; @@ -50,12 +47,11 @@ uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src, uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src, int src_stride, const unsigned char *dst, - int dst_stride, - uint32_t *sse) { + int dst_stride, uint32_t *sse) { int xsum0; unsigned int xxsum0; - vpx_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, - &xsum0, &xxsum0); + vpx_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0, + &xxsum0); *sse = xxsum0; assert(xsum0 <= 255 * 16 * 16); @@ -63,12 +59,10 @@ uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src, return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); } - uint32_t vpx_variance_halfpixvar16x16_hv_sse2(const unsigned char *src, int src_stride, const unsigned char *dst, - int dst_stride, - uint32_t *sse) { + int dst_stride, uint32_t *sse) { int xsum0; unsigned int xxsum0; diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c index 72e42adc91d7861f73a749f4efd3ee66502243f8..7d664110801bd403ebff57fb1de0e003b1f2250f 100644 --- a/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -25,16 +25,13 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { if (bd == 8) { t80 
= _mm_set1_epi16(0x80); - max = _mm_subs_epi16( - _mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80); } else if (bd == 10) { t80 = _mm_set1_epi16(0x200); - max = _mm_subs_epi16( - _mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80); } else { // bd == 12 t80 = _mm_set1_epi16(0x800); - max = _mm_subs_epi16( - _mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80); } min = _mm_subs_epi16(zero, t80); @@ -81,16 +78,16 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, blimit = _mm_slli_epi16( _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); } else { // bd == 12 blimit = _mm_slli_epi16( _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); } q4 = _mm_load_si128((__m128i *)(s + 4 * p)); @@ -118,25 +115,22 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, hev = _mm_subs_epu16(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); - abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0), - _mm_subs_epu16(p0, p1)), - _mm_or_si128(_mm_subs_epu16(q1, q0), - _mm_subs_epu16(q0, q1))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), + _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1), - _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q1), - _mm_subs_epu16(q1, q2))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2), - _mm_subs_epu16(p2, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q2), - _mm_subs_epu16(q2, q3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); mask = _mm_subs_epu16(mask, limit); @@ -160,8 +154,8 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, ps0 = _mm_subs_epi16(p0, t80); qs0 = _mm_subs_epi16(q0, t80); - filt = _mm_and_si128( - signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev); + 
filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), + hev); work_a = _mm_subs_epi16(qs0, ps0); filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); @@ -175,33 +169,27 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, filter2 = _mm_srai_epi16(filter2, 0x3); qs0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), - t80); + signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); ps0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), - t80); + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); filt = _mm_andnot_si128(hev, filt); - qs1 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), - t80); - ps1 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), - t80); + qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), + t80); + ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), + t80); // end highbd_filter4 // loopfilter done // highbd_flat_mask4 - flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0), - _mm_subs_epu16(p0, p2)), - _mm_or_si128(_mm_subs_epu16(p3, p0), - _mm_subs_epu16(p0, p3))); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0), - _mm_subs_epu16(q0, q2)), - _mm_or_si128(_mm_subs_epu16(q3, q0), - _mm_subs_epu16(q0, q3))); + flat = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); flat = _mm_max_epi16(work, flat); work = _mm_max_epi16(abs_p1p0, abs_q1q0); flat = _mm_max_epi16(work, flat); @@ -229,27 +217,23 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 // but referred to as p0-p4 & q0-q4 in fn) - flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0), - _mm_subs_epu16(p0, p4)), - _mm_or_si128(_mm_subs_epu16(q4, q0), - _mm_subs_epu16(q0, q4))); - - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0), - _mm_subs_epu16(p0, p5)), - _mm_or_si128(_mm_subs_epu16(q5, q0), - _mm_subs_epu16(q0, q5))); + flat2 = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)), + _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4))); + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)), + _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5))); flat2 = _mm_max_epi16(work, flat2); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0), - _mm_subs_epu16(p0, p6)), - _mm_or_si128(_mm_subs_epu16(q6, q0), - _mm_subs_epu16(q0, q6))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)), + _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6))); flat2 = _mm_max_epi16(work, flat2); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0), - _mm_subs_epu16(p0, p7)), - _mm_or_si128(_mm_subs_epu16(q7, q0), - _mm_subs_epu16(q0, q7))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)), + _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7))); flat2 = _mm_max_epi16(work, flat2); if (bd == 8) @@ -268,29 +252,26 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, eight = 
_mm_set1_epi16(8); four = _mm_set1_epi16(4); - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), - _mm_add_epi16(p4, p3)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), - _mm_add_epi16(q4, q3)); + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3)); pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1)); pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1)); pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, - pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, - pixetFilter_q2q1q0)); - flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(p7, p0)), 4); - flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(q7, q0)), 4); - flat_p0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3, p0)), 3); - flat_q0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3, q0)), 3); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + flat2_p0 = + _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4); + flat2_q0 = + _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4); + flat_p0 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3); + flat_q0 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3); sum_p7 = _mm_add_epi16(p7, p7); sum_q7 = _mm_add_epi16(q7, q7); @@ -306,10 +287,10 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2); - flat_p1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1)), 3); - flat_q1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1)), 3); + flat_p1 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3); + flat_q1 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3); sum_p7 = _mm_add_epi16(sum_p7, p7); sum_q7 = _mm_add_epi16(sum_q7, q7); @@ -318,53 +299,53 @@ void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5); - flat2_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p2)), 4); - flat2_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q2)), 4); + flat2_p2 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4); + flat2_q2 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1); - flat_p2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2)), 3); - flat_q2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2)), 3); + flat_p2 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3); + flat_q2 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3); sum_p7 = _mm_add_epi16(sum_p7, p7); sum_q7 = 
_mm_add_epi16(sum_q7, q7); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4); - flat2_p3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p3)), 4); - flat2_q3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q3)), 4); + flat2_p3 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4); + flat2_q3 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4); sum_p7 = _mm_add_epi16(sum_p7, p7); sum_q7 = _mm_add_epi16(sum_q7, q7); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3); - flat2_p4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p4)), 4); - flat2_q4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q4)), 4); + flat2_p4 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4); + flat2_q4 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4); sum_p7 = _mm_add_epi16(sum_p7, p7); sum_q7 = _mm_add_epi16(sum_q7, q7); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2); - flat2_p5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p5)), 4); - flat2_q5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q5)), 4); + flat2_p5 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4); + flat2_q5 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4); sum_p7 = _mm_add_epi16(sum_p7, p7); sum_q7 = _mm_add_epi16(sum_q7, q7); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1); - flat2_p6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p6)), 4); - flat2_q6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q6)), 4); + flat2_p6 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4); + flat2_q6 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4); // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -505,8 +486,7 @@ void vpx_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh, - int bd) { + const uint8_t *_thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); @@ -546,19 +526,19 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, t80 = _mm_set1_epi16(0x80); } else if (bd == 10) { blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); t80 = _mm_set1_epi16(0x200); } else { // bd == 12 blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + 
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); t80 = _mm_set1_epi16(0x800); } @@ -568,20 +548,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, qs1 = _mm_subs_epi16(q1, t80); // filter_mask and hev_mask - abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), - _mm_subs_epu16(p0, p1)); - abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), - _mm_subs_epu16(q0, q1)); - - abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), - _mm_subs_epu16(q0, p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), - _mm_subs_epu16(q1, p1)); + abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); flat = _mm_max_epi16(abs_p1p0, abs_q1q0); hev = _mm_subs_epu16(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); - abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); @@ -593,28 +569,24 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, mask = _mm_max_epi16(abs_q1q0, mask); // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1), - _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q1), - _mm_subs_epu16(q1, q2))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2), - _mm_subs_epu16(p2, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q2), - _mm_subs_epu16(q2, q3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); mask = _mm_subs_epu16(mask, limit); mask = _mm_cmpeq_epi16(mask, zero); // flat_mask4 - flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0), - _mm_subs_epu16(p0, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q0), - _mm_subs_epu16(q0, q2))); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0), - _mm_subs_epu16(p0, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q0), - _mm_subs_epu16(q0, q3))); + flat = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); flat = _mm_max_epi16(work, flat); flat = _mm_max_epi16(abs_p1p0, flat); flat = _mm_max_epi16(abs_q1q0, flat); @@ -737,14 +709,10 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, _mm_store_si128((__m128i *)(s + 2 * p), q2); } -void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p, - const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1, - int bd) { +void vpx_highbd_lpf_horizontal_8_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const 
uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); } @@ -752,8 +720,7 @@ void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p, void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh, - int bd) { + const uint8_t *_thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; @@ -765,16 +732,16 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), - _mm_subs_epu16(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), - _mm_subs_epu16(q0, q1)); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); const __m128i one = _mm_set1_epi16(1); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), - _mm_subs_epu16(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), - _mm_subs_epu16(q1, p1)); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); __m128i work; const __m128i t4 = _mm_set1_epi16(4); const __m128i t3 = _mm_set1_epi16(3); @@ -838,7 +805,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, hev = _mm_subs_epu16(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); - abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); @@ -848,15 +815,13 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, mask = _mm_max_epi16(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1), - _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(p3, p2), - _mm_subs_epu16(p2, p3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1), - _mm_subs_epu16(q1, q2)), - _mm_or_si128(_mm_subs_epu16(q3, q2), - _mm_subs_epu16(q2, q3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); mask = _mm_subs_epu16(mask, limit); mask = _mm_cmpeq_epi16(mask, zero); @@ -878,8 +843,8 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, // Filter1 >> 3 work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0 filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0 - filter1 = _mm_and_si128(filter1, t1f); // clamp the range + work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0 + filter1 = 
_mm_and_si128(filter1, t1f); // clamp the range filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits // Filter2 >> 3 @@ -901,12 +866,12 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, q0 = _mm_adds_epi16( signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); - q1 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80); + q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), + t80); p0 = _mm_adds_epi16( signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); - p1 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80); + p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), + t80); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); @@ -914,35 +879,38 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_storeu_si128((__m128i *)(s + 1 * p), q1); } -void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p, - const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1, - int bd) { +void vpx_highbd_lpf_horizontal_4_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); } -static INLINE void highbd_transpose(uint16_t *src[], int in_p, - uint16_t *dst[], int out_p, - int num_8x8_to_transpose) { +static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], + int out_p, int num_8x8_to_transpose) { int idx8x8 = 0; __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; do { uint16_t *in = src[idx8x8]; uint16_t *out = dst[idx8x8]; - p0 = _mm_loadu_si128((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 - p1 = _mm_loadu_si128((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17 - p2 = _mm_loadu_si128((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 - p3 = _mm_loadu_si128((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 - p4 = _mm_loadu_si128((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 - p5 = _mm_loadu_si128((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 - p6 = _mm_loadu_si128((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 - p7 = _mm_loadu_si128((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 + p0 = + _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + p1 = + _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + p2 = + _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + p3 = + _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + p4 = + _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + p5 = + _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + p6 = + _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + p7 = + _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 // 00 10 01 11 02 12 03 13 x0 = _mm_unpacklo_epi16(p0, p1); // 20 30 21 31 22 32 23 33 @@ -960,9 +928,9 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p, // 01 11 21 31 41 51 61 71 x7 = _mm_unpackhi_epi64(x4, x5); - _mm_storeu_si128((__m128i *)(out + 0*out_p), x6); + _mm_storeu_si128((__m128i *)(out 
+ 0 * out_p), x6); // 00 10 20 30 40 50 60 70 - _mm_storeu_si128((__m128i *)(out + 1*out_p), x7); + _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); // 01 11 21 31 41 51 61 71 // 02 12 22 32 03 13 23 33 @@ -974,9 +942,9 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p, // 03 13 23 33 43 53 63 73 x7 = _mm_unpackhi_epi64(x4, x5); - _mm_storeu_si128((__m128i *)(out + 2*out_p), x6); + _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); // 02 12 22 32 42 52 62 72 - _mm_storeu_si128((__m128i *)(out + 3*out_p), x7); + _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); // 03 13 23 33 43 53 63 73 // 04 14 05 15 06 16 07 17 @@ -996,9 +964,9 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p, // 05 15 25 35 45 55 65 75 x7 = _mm_unpackhi_epi64(x4, x5); - _mm_storeu_si128((__m128i *)(out + 4*out_p), x6); + _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); // 04 14 24 34 44 54 64 74 - _mm_storeu_si128((__m128i *)(out + 5*out_p), x7); + _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); // 05 15 25 35 45 55 65 75 // 06 16 26 36 07 17 27 37 @@ -1010,15 +978,15 @@ static INLINE void highbd_transpose(uint16_t *src[], int in_p, // 07 17 27 37 47 57 67 77 x7 = _mm_unpackhi_epi64(x4, x5); - _mm_storeu_si128((__m128i *)(out + 6*out_p), x6); + _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); // 06 16 26 36 46 56 66 76 - _mm_storeu_si128((__m128i *)(out + 7*out_p), x7); + _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); // 07 17 27 37 47 57 67 77 } while (++idx8x8 < num_8x8_to_transpose); } -static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, - int in_p, uint16_t *out, int out_p) { +static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, + uint16_t *out, int out_p) { uint16_t *src0[1]; uint16_t *src1[1]; uint16_t *dest0[1]; @@ -1031,10 +999,8 @@ static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, highbd_transpose(src1, in_p, dest1, out_p, 1); } -void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, +void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; @@ -1056,14 +1022,10 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, highbd_transpose(src, 8, dst, p, 1); } -void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_vertical_4_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); uint16_t *src[2]; uint16_t *dst[2]; @@ -1083,10 +1045,8 @@ void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p, highbd_transpose(src, 16, dst, p, 2); } -void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, +void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; @@ -1108,14 +1068,10 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, highbd_transpose(src, 8, dst, p, 1); } -void 
vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1, - int bd) { +void vpx_highbd_lpf_vertical_8_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); uint16_t *src[2]; uint16_t *dst[2]; @@ -1136,11 +1092,9 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p, highbd_transpose(src, 16, dst, p, 2); } -void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, - const uint8_t *blimit, +void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int bd) { + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]); uint16_t *src[2]; uint16_t *dst[2]; @@ -1154,8 +1108,8 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, highbd_transpose(src, p, dst, 8, 2); // Loop filtering - vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, - thresh, bd); + vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, + bd); src[0] = t_dst; src[1] = t_dst + 8 * 8; dst[0] = s - 8; @@ -1165,12 +1119,10 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, highbd_transpose(src, 8, dst, p, 2); } -void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, - int p, +void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh, - int bd) { + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[256]); // Transpose 16x16 diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 164ffcff2bd7fdac2ba80305d59c2449b8b09cc2..dad00dfe97162abc877ff149f2ab176cd40a69f4 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -16,26 +16,19 @@ #include "vpx_ports/mem.h" #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, - intptr_t count, - int skip_block, - const int16_t *zbin_ptr, +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, - const int16_t *iscan) { + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, j, non_zero_regs = (int)count / 4, eob_i = -1; __m128i zbins[2]; __m128i nzbins[2]; - zbins[0] = _mm_set_epi32((int)zbin_ptr[1], - (int)zbin_ptr[1], - (int)zbin_ptr[1], + zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[0]); zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); @@ -74,14 +67,13 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32( - _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); tmp1 = 
_mm_or_si128(tmp1, tmp2); test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i*)abs_coeff, coeffs); - _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); for (j = 0; j < 4; j++) { if (test & (1 << (4 * j))) { @@ -92,8 +84,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16); qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) - eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; } } } @@ -101,20 +92,12 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, *eob_ptr = eob_i + 1; } - -void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, - intptr_t n_coeffs, - int skip_block, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, - uint16_t *eob_ptr, - const int16_t *scan, - const int16_t *iscan) { +void vpx_highbd_quantize_b_32x32_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; @@ -123,10 +106,7 @@ void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; - zbins[0] = _mm_set_epi32(zbin1_tmp, - zbin1_tmp, - zbin1_tmp, - zbin0_tmp); + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); nzbins[0] = _mm_setzero_si128(); @@ -147,14 +127,10 @@ void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); cmp1 = _mm_and_si128(cmp1, cmp2); test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) - idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) - idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) - idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) - idx_arr[idx++] = i * 4 + 3; + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; } // Quantization pass: only process the coefficients selected in @@ -164,15 +140,14 @@ void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff - + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) - eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; } } *eob_ptr = eob + 1; diff --git a/vpx_dsp/x86/highbd_subtract_sse2.c b/vpx_dsp/x86/highbd_subtract_sse2.c index 33e464b7842d16b331bf977b66f52e8fb5b6a4fd..e7d5ac2982f23bd7f039b66bab74b76c82e2489f 100644 --- a/vpx_dsp/x86/highbd_subtract_sse2.c +++ b/vpx_dsp/x86/highbd_subtract_sse2.c @@ -15,10 +15,10 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" -typedef void (*SubtractWxHFuncType)( - int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride); +typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, + ptrdiff_t pred_stride); static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, const uint16_t *src, ptrdiff_t src_stride, @@ -26,17 +26,17 @@ static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, __m128i u0, u1, u2, u3; __m128i v0, v1, v2, v3; __m128i x0, x1, x2, x3; - int64_t *store_diff = (int64_t *) (diff + 0 * diff_stride); + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); - u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride)); + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride)); + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); @@ -44,11 +44,11 @@ static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, x3 = _mm_sub_epi16(u3, v3); _mm_storel_epi64((__m128i *)store_diff, x0); - store_diff = (int64_t *) (diff + 1 * diff_stride); + store_diff = (int64_t *)(diff + 1 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x1); - store_diff = (int64_t *) (diff + 2 * diff_stride); + store_diff = (int64_t *)(diff + 2 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x2); - store_diff = (int64_t *) (diff + 3 * diff_stride); + store_diff = (int64_t *)(diff + 3 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x3); } @@ -58,25 +58,25 @@ static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, __m128i u0, u1, u2, u3, u4, u5, u6, u7; __m128i v0, v1, v2, v3, v4, v5, v6, v7; __m128i x0, x1, x2, x3, x4, x5, x6, x7; - int64_t *store_diff = (int64_t *) (diff + 0 * diff_stride); - - u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride)); - u4 = _mm_loadu_si128((__m128i const *) (src + 4 * src_stride)); - u5 = _mm_loadu_si128((__m128i const *) (src + 5 * src_stride)); - u6 = _mm_loadu_si128((__m128i const *) (src + 6 * 
src_stride)); - u7 = _mm_loadu_si128((__m128i const *) (src + 7 * src_stride)); - - v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride)); - v4 = _mm_loadu_si128((__m128i const *) (pred + 4 * pred_stride)); - v5 = _mm_loadu_si128((__m128i const *) (pred + 5 * pred_stride)); - v6 = _mm_loadu_si128((__m128i const *) (pred + 6 * pred_stride)); - v7 = _mm_loadu_si128((__m128i const *) (pred + 7 * pred_stride)); + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); @@ -88,19 +88,19 @@ static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, x7 = _mm_sub_epi16(u7, v7); _mm_storel_epi64((__m128i *)store_diff, x0); - store_diff = (int64_t *) (diff + 1 * diff_stride); + store_diff = (int64_t *)(diff + 1 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x1); - store_diff = (int64_t *) (diff + 2 * diff_stride); + store_diff = (int64_t *)(diff + 2 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x2); - store_diff = (int64_t *) (diff + 3 * diff_stride); + store_diff = (int64_t *)(diff + 3 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x3); - store_diff = (int64_t *) (diff + 4 * diff_stride); + store_diff = (int64_t *)(diff + 4 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x4); - store_diff = (int64_t *) (diff + 5 * diff_stride); + store_diff = (int64_t *)(diff + 5 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x5); - store_diff = (int64_t *) (diff + 6 * diff_stride); + store_diff = (int64_t *)(diff + 6 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x6); - store_diff = (int64_t *) (diff + 7 * diff_stride); + store_diff = (int64_t *)(diff + 7 * diff_stride); _mm_storel_epi64((__m128i *)store_diff, x7); } @@ -111,25 +111,25 @@ static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride, __m128i v0, v1, v2, v3; __m128i x0, x1, x2, x3; - u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride)); + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = 
_mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride)); + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); x2 = _mm_sub_epi16(u2, v2); x3 = _mm_sub_epi16(u3, v3); - _mm_storeu_si128((__m128i *) (diff + 0 * diff_stride), x0); - _mm_storeu_si128((__m128i *) (diff + 1 * diff_stride), x1); - _mm_storeu_si128((__m128i *) (diff + 2 * diff_stride), x2); - _mm_storeu_si128((__m128i *) (diff + 3 * diff_stride), x3); + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); } static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, @@ -139,23 +139,23 @@ static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, __m128i v0, v1, v2, v3, v4, v5, v6, v7; __m128i x0, x1, x2, x3, x4, x5, x6, x7; - u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride)); - u4 = _mm_loadu_si128((__m128i const *) (src + 4 * src_stride)); - u5 = _mm_loadu_si128((__m128i const *) (src + 5 * src_stride)); - u6 = _mm_loadu_si128((__m128i const *) (src + 6 * src_stride)); - u7 = _mm_loadu_si128((__m128i const *) (src + 7 * src_stride)); - - v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride)); - v4 = _mm_loadu_si128((__m128i const *) (pred + 4 * pred_stride)); - v5 = _mm_loadu_si128((__m128i const *) (pred + 5 * pred_stride)); - v6 = _mm_loadu_si128((__m128i const *) (pred + 6 * pred_stride)); - v7 = _mm_loadu_si128((__m128i const *) (pred + 7 * pred_stride)); + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * 
pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); @@ -166,14 +166,14 @@ static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, x6 = _mm_sub_epi16(u6, v6); x7 = _mm_sub_epi16(u7, v7); - _mm_storeu_si128((__m128i *) (diff + 0 * diff_stride), x0); - _mm_storeu_si128((__m128i *) (diff + 1 * diff_stride), x1); - _mm_storeu_si128((__m128i *) (diff + 2 * diff_stride), x2); - _mm_storeu_si128((__m128i *) (diff + 3 * diff_stride), x3); - _mm_storeu_si128((__m128i *) (diff + 4 * diff_stride), x4); - _mm_storeu_si128((__m128i *) (diff + 5 * diff_stride), x5); - _mm_storeu_si128((__m128i *) (diff + 6 * diff_stride), x6); - _mm_storeu_si128((__m128i *) (diff + 7 * diff_stride), x7); + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); + _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4); + _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5); + _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6); + _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); } static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride, @@ -349,17 +349,14 @@ static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { return ret_func_ptr; } -void vpx_highbd_subtract_block_sse2( - int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src8, ptrdiff_t src_stride, - const uint8_t *pred8, - ptrdiff_t pred_stride, - int bd) { +void vpx_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride, int bd) { uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); SubtractWxHFuncType func; - (void) bd; + (void)bd; func = getSubtractFunc(rows, cols); func(diff, diff_stride, src, src_stride, pred, pred_stride); diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c index 364391578592582b06efd67e4183dfe29a7c1766..76e8816db9686c21aa4ac9883a2461e0b4acf160 100644 --- a/vpx_dsp/x86/highbd_variance_sse2.c +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -15,9 +15,9 @@ #include "vpx_ports/mem.h" -typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); +typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, @@ -28,8 +28,8 @@ uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, uint32_t *sse, int *sum); static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, int block_size) { int i, j; @@ -40,8 +40,8 @@ static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); 
*sse += sse0; *sum += sum0; } @@ -49,8 +49,8 @@ static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, } static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, int block_size) { int i, j; uint64_t sse_long = 0; @@ -60,8 +60,8 @@ static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); sse_long += sse0; sum_long += sum0; } @@ -71,8 +71,8 @@ static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, } static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, high_variance_fn_t var_fn, int block_size) { int i, j; uint64_t sse_long = 0; @@ -82,8 +82,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); sse_long += sse0; sum_long += sum0; } @@ -92,84 +92,83 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); } - -#define HIGH_GET_VAR(S) \ -void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ -} \ -\ -void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 2); \ - *sse = ROUND_POWER_OF_TWO(*sse, 4); \ -} \ -\ -void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 4); \ - *sse = ROUND_POWER_OF_TWO(*sse, 8); \ -} +#define HIGH_GET_VAR(S) \ + void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint16_t *src = 
CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 2); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + } HIGH_GET_VAR(16); HIGH_GET_VAR(8); #undef HIGH_GET_VAR -#define VAR_FN(w, h, block_size, shift) \ -uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vpx_highbd_calc##block_size##x##block_size##var_sse2, \ - block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ -} \ -\ -uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_10_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ - return (var >= 0) ? (uint32_t)var : 0; \ -} \ -\ -uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_12_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ - return (var >= 0) ? (uint32_t)var : 0; \ -} +#define VAR_FN(w, h, block_size, shift) \ + uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ + } \ + \ + uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } VAR_FN(64, 64, 16, 12); VAR_FN(64, 32, 16, 11); @@ -185,13 +184,13 @@ VAR_FN(8, 8, 8, 6); #undef VAR_FN unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + vpx_highbd_calc16x16var_sse2, 16); return *sse; } @@ -201,8 +200,8 @@ unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + vpx_highbd_calc16x16var_sse2, 16); return *sse; } @@ -212,19 +211,19 @@ unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + vpx_highbd_calc16x16var_sse2, 16); return *sse; } unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + vpx_highbd_calc8x8var_sse2, 8); return *sse; } @@ -234,8 +233,8 @@ unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + vpx_highbd_calc8x8var_sse2, 8); return *sse; } @@ -245,25 +244,21 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, int sum; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + vpx_highbd_calc8x8var_sse2, 8); return *sse; } // The 2 unused parameters are place holders for PIC enabled build. 
// These definitions are for functions defined in // highbd_subpel_variance_impl_sse2.asm -#define DECL(w, opt) \ - int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint16_t *dst, \ - ptrdiff_t dst_stride, \ - int height, \ - unsigned int *sse, \ - void *unused0, void *unused); +#define DECL(w, opt) \ + int vpx_highbd_sub_pixel_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, int height, \ + unsigned int *sse, void *unused0, void *unused); #define DECLS(opt) \ - DECL(8, opt); \ + DECL(8, opt); \ DECL(16, opt) DECLS(sse2); @@ -271,152 +266,134 @@ DECLS(sse2); #undef DECLS #undef DECL -#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ -uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \ - int src_stride, \ - int x_offset, \ - int y_offset, \ - const uint8_t *dst8, \ - int dst_stride, \ - uint32_t *sse_ptr) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, h, \ - &sse, NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ - src_stride, \ - x_offset, y_offset, \ - dst + 16, \ - dst_stride, \ - h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, \ - dst + 48, dst_stride, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - h, &sse, NULL, NULL); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ - src_stride, \ - x_offset, y_offset, \ - dst + 16, \ - dst_stride, \ - h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 2); \ - sse = ROUND_POWER_OF_TWO(sse, 4); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - int start_row; \ - uint32_t sse; \ - int se = 0; \ - uint64_t long_sse = 0; \ - uint16_t *src = 
CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - for (start_row = 0; start_row < h; start_row +=16) { \ - uint32_t sse2; \ - int height = h - start_row < 16 ? h - start_row : 16; \ - int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + (start_row * dst_stride), \ - dst_stride, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf) { \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + 16 + (start_row * dst_stride), \ - dst_stride, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + 32 + (start_row * dst_stride), \ - dst_stride, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + 48 + (start_row * dst_stride), \ - dst_stride, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - }\ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 4); \ - sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} - -#define FNS(opt) \ -FN(64, 64, 16, 6, 6, opt, (int64_t)); \ -FN(64, 32, 16, 6, 5, opt, (int64_t)); \ -FN(32, 64, 16, 5, 6, opt, (int64_t)); \ -FN(32, 32, 16, 5, 5, opt, (int64_t)); \ -FN(32, 16, 16, 5, 4, opt, (int64_t)); \ -FN(16, 32, 16, 4, 5, opt, (int64_t)); \ -FN(16, 16, 16, 4, 4, opt, (int64_t)); \ -FN(16, 8, 16, 4, 3, opt, (int64_t)); \ -FN(8, 16, 8, 3, 4, opt, (int64_t)); \ -FN(8, 8, 8, 3, 3, opt, (int64_t)); \ -FN(8, 4, 8, 3, 2, opt, (int64_t)); +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ + if (w > 
wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ + NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ + &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); FNS(sse2); @@ -424,183 +401,162 @@ FNS(sse2); #undef FN // The 2 unused parameters are place holders for PIC enabled build. 
-#define DECL(w, opt) \ -int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint16_t *dst, \ - ptrdiff_t dst_stride, \ - const uint16_t *sec, \ - ptrdiff_t sec_stride, \ - int height, \ - unsigned int *sse, \ - void *unused0, void *unused); +#define DECL(w, opt) \ + int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused); #define DECLS(opt1) \ -DECL(16, opt1) \ -DECL(8, opt1) + DECL(16, opt1) \ + DECL(8, opt1) DECLS(sse2); #undef DECL #undef DECLS -#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ -uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, \ - y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, \ - dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, \ - dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, \ - dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - sec, w, h, &sse, NULL, NULL); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, \ - x_offset, y_offset, \ - dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 2); \ - sse = ROUND_POWER_OF_TWO(sse, 4); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t 
vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - int start_row; \ - uint32_t sse; \ - int se = 0; \ - uint64_t long_sse = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - for (start_row = 0; start_row < h; start_row +=16) { \ - uint32_t sse2; \ - int height = h - start_row < 16 ? h - start_row : 16; \ - int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + (start_row * dst_stride), dst_stride, \ - sec + (start_row * w), w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf) { \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, \ - dst + 16 + (start_row * dst_stride), dst_stride, \ - sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, \ - dst + 32 + (start_row * dst_stride), dst_stride, \ - sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, \ - dst + 48 + (start_row * dst_stride), dst_stride, \ - sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - } \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 4); \ - sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} - +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = 
CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ + w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ + sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + } -#define FNS(opt1) \ -FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ -FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ -FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ -FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ -FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ -FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ -FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ -FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ -FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ -FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ -FN(8, 4, 8, 3, 2, opt1, (int64_t)); +#define FNS(opt1) \ + FN(64, 64, 16, 6, 
6, opt1, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ + FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt1, (int64_t)); FNS(sse2); #undef FNS #undef FN -void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred, - int width, int height, - const uint8_t *ref8, - int ref_stride) { +void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height, + const uint8_t *ref8, int ref_stride) { int i, j; int stride = ref_stride << 3; uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); @@ -608,7 +564,7 @@ void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred, if (width >= 8) { // read 8 points at one time for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 8) { + for (j = 0; j < width; j += 8) { __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); @@ -629,14 +585,14 @@ void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred, _mm_storeu_si128((__m128i *)(comp_pred), t0); comp_pred += 8; - ref += 64; // 8 * 8; + ref += 64; // 8 * 8; } ref += stride - (width << 3); } } else { // read 4 points at one time for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 4) { + for (j = 0; j < width; j += 4) { __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); @@ -657,9 +613,8 @@ void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred, } void vpx_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, - const uint8_t *pred8, - int width, int height, - const uint8_t *ref8, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, int ref_stride) { const __m128i one = _mm_set1_epi16(1); int i, j; @@ -670,7 +625,7 @@ void vpx_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, if (width >= 8) { // read 8 points at one time for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 8) { + for (j = 0; j < width; j += 8) { __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); @@ -704,7 +659,7 @@ void vpx_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, } else { // read 4 points at one time for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 4) { + for (j = 0; j < width; j += 4) { __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); diff --git a/vpx_dsp/x86/highbd_variance_sse4.c b/vpx_dsp/x86/highbd_variance_sse4.c index 4d0b75deab782f90b3efc9d27b7e0e88568a6aa1..d1d2146b473ba2f4e765d293525e95fe1dd5450a 100644 --- a/vpx_dsp/x86/highbd_variance_sse4.c +++ b/vpx_dsp/x86/highbd_variance_sse4.c @@ -65,10 +65,8 @@ static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, *sum = (int64_t)_mm_extract_epi32(y0, 0); } -uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a, - int a_stride, - const uint8_t *b, - int b_stride, +uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t 
*sse) { int64_t sum; uint64_t local_sse; @@ -79,10 +77,8 @@ uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a, return *sse - (uint32_t)((sum * sum) >> 4); } -uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a, - int a_stride, - const uint8_t *b, - int b_stride, +uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse) { int64_t sum; uint64_t local_sse; @@ -94,10 +90,8 @@ uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a, return *sse - (uint32_t)((sum * sum) >> 4); } -uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a, - int a_stride, - const uint8_t *b, - int b_stride, +uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse) { int64_t sum; uint64_t local_sse; @@ -111,136 +105,108 @@ uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a, // Sub-pixel uint32_t vpx_highbd_8_sub_pixel_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse) { - + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), - 4, dst, dst_stride, sse); + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, + sse); } uint32_t vpx_highbd_10_sub_pixel_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse) { + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), - 4, dst, dst_stride, sse); + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); } uint32_t vpx_highbd_12_sub_pixel_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse) { + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); - - return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), - 4, dst, dst_stride, sse); + src, fdata3, src_stride, 1, 4 + 1, 4, 
bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); } // Sub-pixel average uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse, + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, const uint8_t *second_pred) { - uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); - vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, - CONVERT_TO_BYTEPTR(temp2), 4); + vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), + 4); - return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), - 4, dst, dst_stride, sse); + return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, + sse); } uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse, + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, const uint8_t *second_pred) { - uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); - vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, - CONVERT_TO_BYTEPTR(temp2), 4); + vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), + 4); - return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), - 4, dst, dst_stride, sse); + return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); } uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1( - const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - uint32_t *sse, + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, const uint8_t *second_pred) { - uint16_t fdata3[(4 + 1) * 4]; uint16_t temp2[4 * 4]; DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); vpx_highbd_var_filter_block2d_bil_first_pass( - src, fdata3, src_stride, 1, 4 + 1, - 4, bilinear_filters_2t[xoffset]); - vpx_highbd_var_filter_block2d_bil_second_pass( - fdata3, temp2, 4, 4, 4, 4, - bilinear_filters_2t[yoffset]); + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); - vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, - CONVERT_TO_BYTEPTR(temp2), 4); + 
vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), + 4); - return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), - 4, dst, dst_stride, sse); + return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); } diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index df5068c624b483f8d148b356e0b25f6aabdd17b2..a6fc1161f120b99db58b6331ff2660bd9d5d02cb 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -12,14 +12,14 @@ #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -#define RECON_AND_STORE4X4(dest, in_x) \ -{ \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ -} +#define RECON_AND_STORE4X4(dest, in_x) \ + { \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)(dest) = _mm_cvtsi128_si32(d0); \ + } void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { @@ -263,192 +263,189 @@ void iadst4_sse2(__m128i *in) { in[1] = _mm_packs_epi32(u[2], u[3]); } -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ + const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + const __m128i 
tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ } -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ - out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ +#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ + const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ } #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ } // Define Macro for multiplying elements by constants and adding them together. 
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ - cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ +#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ + res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ } #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ 
+ tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ } -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_4, \ - stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_0, \ - stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ + out4, out5, out6, out7) \ + { \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ + stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = 
_mm_unpackhi_epi16(in2, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ + stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ + tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ + tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ + tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + } \ + \ + /* Stage4 */ \ + out0 = _mm_adds_epi16(stp1_0, stp2_7); \ + out1 = _mm_adds_epi16(stp1_1, stp1_6); \ + out2 = _mm_adds_epi16(stp1_2, stp1_5); \ + out3 = _mm_adds_epi16(stp1_3, stp2_4); \ + out4 = _mm_subs_epi16(stp1_3, stp2_4); \ + out5 = _mm_subs_epi16(stp1_2, stp1_5); \ + out6 = _mm_subs_epi16(stp1_1, stp1_6); \ + out7 = _mm_subs_epi16(stp1_0, stp2_7); \ } void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, @@ -484,12 +481,12 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, // 2-D for (i = 0; i < 2; i++) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, + in6, in7); } // Final rounding and shift @@ -560,12 +557,12 @@ void idct8_sse2(__m128i *in) { __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], - in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, + in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], + in[4], in[5], in[6], in[7]); } void iadst8_sse2(__m128i *in) { @@ -906,8 +903,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, - in0, in1, in2, in3, in4, in5, in6, in7); + IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, + in5, in6, in7); // Final rounding and shift in0 = _mm_adds_epi16(in0, final_rounding); in1 = _mm_adds_epi16(in1, final_rounding); @@ -937,242 +934,234 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t 
*input, uint8_t *dest, RECON_AND_STORE(dest + 7 * stride, in7); } -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ - stg2_0, stg2_1, stg2_2, stg2_3, \ - stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ - stg2_4, stg2_5, stg2_6, stg2_7, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ - stg3_0, stg3_1, stg3_2, stg3_3, \ - stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ - stg4_0, stg4_1, stg4_2, stg4_3, \ - stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = 
_mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ +#define IDCT16 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ + stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ + \ + MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ + stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ + stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ + \ + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + \ + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, 
stg4_1, \ + stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ } -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ - stg2_0, stg2_1, stg2_6, stg2_7, \ - stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ - stg3_0, stg3_1, \ - stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); 
\ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ - stg4_0, stg4_1, \ - stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } +#define IDCT16_10 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ + stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ + stp1_12_0) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ + \ + stp1_9 = stp1_8_0; \ + stp1_10 = stp1_11; \ + \ + stp1_13 = stp1_12_0; \ + stp1_14 = stp1_15; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = 
_mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ + stp2_5 = stp2_4; \ + stp2_6 = stp2_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_2 = stp1_1; \ + stp1_3 = stp1_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + } void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { @@ -1207,10 +1196,10 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, __m128i in[16], l[16], r[16], *curr1; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; @@ -1312,8 +1301,8 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, dc_value = _mm_set1_epi16(a); for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); + RECON_AND_STORE(dest + 0, dc_value); + RECON_AND_STORE(dest + 8, dc_value); dest += 
stride; } } @@ -1891,9 +1880,9 @@ static void idct16_8col(__m128i *in) { u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - s[8] = _mm_packs_epi32(u[0], u[1]); + s[8] = _mm_packs_epi32(u[0], u[1]); s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); + s[9] = _mm_packs_epi32(u[4], u[5]); s[14] = _mm_packs_epi32(u[6], u[7]); s[10] = _mm_packs_epi32(u[8], u[9]); s[13] = _mm_packs_epi32(u[10], u[11]); @@ -2021,7 +2010,7 @@ static void idct16_8col(__m128i *in) { s[7] = _mm_add_epi16(t[6], t[7]); s[8] = t[8]; s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); + s[9] = _mm_packs_epi32(u[8], u[9]); s[14] = _mm_packs_epi32(u[10], u[11]); s[10] = _mm_packs_epi32(u[12], u[13]); s[13] = _mm_packs_epi32(u[14], u[15]); @@ -2167,11 +2156,11 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, + stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, + stp1_12_0; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; // First 1-D inverse DCT @@ -2203,7 +2192,7 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - stp2_8 = _mm_packs_epi32(tmp0, tmp2); + stp2_8 = _mm_packs_epi32(tmp0, tmp2); stp2_11 = _mm_packs_epi32(tmp5, tmp7); } @@ -2267,9 +2256,9 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, tmp2 = _mm_add_epi16(stp2_9, stp2_10); tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); stp1_11 = _mm_unpacklo_epi64(tmp1, zero); stp1_13 = _mm_unpackhi_epi64(tmp3, zero); @@ -2381,650 +2370,647 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, } } -#define LOAD_DQCOEFF(reg, input) \ - { \ +#define LOAD_DQCOEFF(reg, input) \ + { \ reg = load_input_data(input); \ - input += 8; \ - } \ - -#define IDCT32_34 \ -/* Stage1 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ - stg1_1, stp1_16, stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ - stg1_7, stp1_19, stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ - stg1_9, stp1_20, stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ - stg1_15, stp1_23, stp1_24); \ -} \ -\ -/* Stage2 */ \ -{ 
\ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ - stg2_1, stp2_8, stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ - stg2_7, stp2_11, stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ - stg3_1, stp1_4, stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ - stg4_1, stp2_0, stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = 
_mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = 
_mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} + input += 8; \ + } +#define IDCT32_34 \ + /* Stage1 */ \ + { \ + const __m128i zero = _mm_setzero_si128(); \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ + \ + const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ + \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ + stp1_31); \ + MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ + stp1_28); \ + MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ + stp1_27); \ + MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ + stp1_24); \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i zero = _mm_setzero_si128(); \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ + \ + 
const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ + stp2_15); \ + MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ + stp2_12); \ + \ + stp2_16 = stp1_16; \ + stp2_19 = stp1_19; \ + \ + stp2_20 = stp1_20; \ + stp2_23 = stp1_23; \ + \ + stp2_24 = stp1_24; \ + stp2_27 = stp1_27; \ + \ + stp2_28 = stp1_28; \ + stp2_31 = stp1_31; \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i zero = _mm_setzero_si128(); \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ + \ + MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ + stp1_7); \ + \ + stp1_8 = stp2_8; \ + stp1_11 = stp2_11; \ + stp1_12 = stp2_12; \ + stp1_15 = stp2_15; \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ + stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ + stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i zero = _mm_setzero_si128(); \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ + stp2_1); \ + \ + stp2_4 = stp1_4; \ + stp2_5 = stp1_4; \ + stp2_6 = stp1_7; \ + stp2_7 = stp1_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ + } \ + \ 
+ /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = stp2_0; \ + stp1_1 = stp2_1; \ + stp1_2 = stp2_1; \ + stp1_3 = stp2_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ + stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + 
stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ + } \ + \ + /* Stage7 */ \ + { \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ + stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } -#define IDCT32 \ -/* Stage1 */ \ -{ \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ - stp1_17, stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ - stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ - stp1_19, 
stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ -} \ -\ -/* Stage2 */ \ -{ \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ - \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ - stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = 
stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ - stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ - stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - 
\ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = 
_mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} +#define IDCT32 \ + /* Stage1 */ \ + { \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ + const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ + const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \ + stp1_30) \ + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \ + stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ + \ + const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ + stp2_14) \ + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ + \ + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ + stp2_21 = _mm_sub_epi16(stp1_20, 
stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ + \ + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + \ + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ + stp1_6) \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ + stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ + stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \ + stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = 
_mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ + stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + 
stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ + } \ + \ + /* Stage7 */ \ + { \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ + stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } // Only upper-left 8x8 has non-zero coeff void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); // idct constants for each stage const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); @@ -3060,15 +3046,13 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, __m128i in[32], col[32]; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - 
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; @@ -3236,15 +3220,13 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, __m128i in[32], col[128], zero_idx[16]; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i, j, i32; @@ -3469,8 +3451,8 @@ void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, dc_value = _mm_set1_epi16(a); for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); + RECON_AND_STORE(dest + 0 + j * stride, dc_value); + RECON_AND_STORE(dest + 8 + j * stride, dc_value); RECON_AND_STORE(dest + 16 + j * stride, dc_value); RECON_AND_STORE(dest + 24 + j * stride, dc_value); } @@ -3595,8 +3577,7 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, tran_low_t temp_in[4], temp_out[4]; // Columns for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; vpx_highbd_idct4_c(temp_in, temp_out, bd); for (j = 0; j < 4; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -3685,19 +3666,18 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, __m128i d[8]; for (i = 0; i < 8; i++) { inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); inptr[i] = _mm_srai_epi16(inptr[i], 5); d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); // Store - 
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); + _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); } } } else { // Run the un-optimised column transform tran_low_t temp_in[8], temp_out[8]; for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); for (j = 0; j < 8; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -3789,19 +3769,18 @@ void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, __m128i d[8]; for (i = 0; i < 8; i++) { inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); inptr[i] = _mm_srai_epi16(inptr[i], 5); d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); + _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); } } } else { // Run the un-optimised column transform tran_low_t temp_in[8], temp_out[8]; for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); for (j = 0; j < 8; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -3897,25 +3876,24 @@ void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, { __m128i d[2]; for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); + inptr[i] = _mm_add_epi16(inptr[i], rounding); + inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); + inptr[i] = _mm_srai_epi16(inptr[i], 6); + inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); + _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); } } } else { // Run the un-optimised column transform tran_low_t temp_in[16], temp_out[16]; for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); for (j = 0; j < 16; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( @@ -4016,25 +3994,24 @@ void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, { __m128i d[2]; for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = 
clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); + inptr[i] = _mm_add_epi16(inptr[i], rounding); + inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); + d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); + inptr[i] = _mm_srai_epi16(inptr[i], 6); + inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); + d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); + d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); + _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); + _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); } } } else { // Run the un-optimised column transform tran_low_t temp_in[16], temp_out[16]; for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); for (j = 0; j < 16; ++j) { dest[j * stride + i] = highbd_clip_pixel_add( diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index bd520c18e56fea6363c07f384aed118722156754..d762a04abcd7b0f8d7b8d1ee0fee0c83b43b223c 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -47,16 +47,16 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); } -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ { \ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ } -static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { +static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); @@ -95,43 +95,43 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { static INLINE __m128i load_input_data(const tran_low_t *data) { #if CONFIG_VP9_HIGHBITDEPTH return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); + data[6], data[7]); #else return _mm_load_si128((const __m128i *)data); #endif } static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); + in[0] = load_input_data(input + 0 * 16); + in[1] = load_input_data(input + 1 * 16); + in[2] = load_input_data(input + 2 * 16); + in[3] = load_input_data(input + 3 * 16); 
+ in[4] = load_input_data(input + 4 * 16); + in[5] = load_input_data(input + 5 * 16); + in[6] = load_input_data(input + 6 * 16); + in[7] = load_input_data(input + 7 * 16); + + in[8] = load_input_data(input + 8 * 16); + in[9] = load_input_data(input + 9 * 16); + in[10] = load_input_data(input + 10 * 16); + in[11] = load_input_data(input + 11 * 16); + in[12] = load_input_data(input + 12 * 16); + in[13] = load_input_data(input + 13 * 16); + in[14] = load_input_data(input + 14 * 16); + in[15] = load_input_data(input + 15 * 16); } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ } static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); const __m128i zero = _mm_setzero_si128(); // Final rounding and shift in[0] = _mm_adds_epi16(in[0], final_rounding); @@ -168,16 +168,16 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { in[14] = _mm_srai_epi16(in[14], 6); in[15] = _mm_srai_epi16(in[15], 6); - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); + RECON_AND_STORE(dest + 8 * stride, in[8]); + RECON_AND_STORE(dest + 9 * stride, in[9]); RECON_AND_STORE(dest + 10 * stride, in[10]); RECON_AND_STORE(dest + 11 * stride, in[11]); RECON_AND_STORE(dest + 12 * stride, in[12]); diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c index be1087c1e951a195ad867f2945853e4211dc8389..85923b4478ac7aa4e7a0f9ee75034b44d8f2cbb2 100644 --- a/vpx_dsp/x86/loopfilter_avx2.c +++ b/vpx_dsp/x86/loopfilter_avx2.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include <immintrin.h> /* AVX2 */ +#include <immintrin.h> /* AVX2 */ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" @@ -17,387 +17,353 @@ void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; - __m128i abs_p1p0; - - const __m128i thresh = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _thresh[0])); - const __m128i limit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _limit[0])); - const __m128i blimit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _blimit[0])); - - q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p)); - q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p)); - q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p)); - q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p)); - q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p)); - q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + const __m128i thresh = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); + const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); + const __m128i blimit = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = 
_mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), - _mm_subs_epu8(q0p0, q1p1)); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), - _mm_subs_epu8(p0q0, q0p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), - _mm_subs_epu8(p1q1, q1p1)); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), - _mm_subs_epu8(q1p1, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), - _mm_subs_epu8(q2p2, q3p3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); + __m128i work; + flat = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, 
q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask } - // lp filter + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 0xB); - - /* Filter1 >> 3 */ - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128( - _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), - _mm_subs_epu8(q0p0, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), - _mm_subs_epu8(q0p0, q3p3))); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p)); - q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), - (__m64 *) (s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p)); - q6p6 = _mm_castps_si128( - 
_mm_loadh_pi(_mm_castsi128_ps(q6p6), - (__m64 *) (s + 6 * p))); - - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), - _mm_subs_epu8(q0p0, q4p4)), - _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), - _mm_subs_epu8(q0p0, q5p5))); - - q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p)); - q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), - (__m64 *) (s + 7 * p))); - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), - _mm_subs_epu8(q0p0, q6p6)), - _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), - _mm_subs_epu8(q0p0, q7p7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero); - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - _mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, - _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, - _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, - _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), - 4); - flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), - 4); - flat2_q1p1 = _mm_packus_epi16(res_p, 
res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), - 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), - 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), - 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), - 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), - 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = 
_mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6)); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5)); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4)); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3)); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2)); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1)); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0)); + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, 
_mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + 
res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + } } DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { - 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, + 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; @@ -405,575 +371,543 @@ void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, - q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, - p256_0, q256_0; - - const __m128i thresh = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _thresh[0])); - const __m128i limit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _limit[0])); - const __m128i blimit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) 
_blimit[0])); - - p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 5 * p))); - p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 4 * p))); - p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 3 * p))); - p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 2 * p))); - p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 1 * p))); - q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 0 * p))); - q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 1 * p))); - q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 2 * p))); - q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 3 * p))); - q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 4 * p))); - - p4 = _mm256_castsi256_si128(p256_4); - p3 = _mm256_castsi256_si128(p256_3); - p2 = _mm256_castsi256_si128(p256_2); - p1 = _mm256_castsi256_si128(p256_1); - p0 = _mm256_castsi256_si128(p256_0); - q0 = _mm256_castsi256_si128(q256_0); - q1 = _mm256_castsi256_si128(q256_1); - q2 = _mm256_castsi256_si128(q256_2); - q3 = _mm256_castsi256_si128(q256_3); - q4 = _mm256_castsi256_si128(q256_4); + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, + p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; + + const __m128i thresh = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); + const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); + const __m128i blimit = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + + p256_4 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); + p256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); + p256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); + p256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); + p256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); + q256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); + q256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); + q256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); + q256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); + q256_4 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); + + p4 = _mm256_castsi256_si128(p256_4); + p3 = _mm256_castsi256_si128(p256_3); + p2 = _mm256_castsi256_si128(p256_2); + p1 = _mm256_castsi256_si128(p256_1); + p0 = _mm256_castsi256_si128(p256_0); + q0 = _mm256_castsi256_si128(q256_0); + q1 = _mm256_castsi256_si128(q256_1); + q2 = _mm256_castsi256_si128(q256_2); + q3 = _mm256_castsi256_si128(q256_3); + q4 = _mm256_castsi256_si128(q256_4); + + { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, 
p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, + flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, + flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - 
__m128i work; - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); + __m128i work; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); + q256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); + p5 = _mm256_castsi256_si128(p256_5); + q5 = _mm256_castsi256_si128(q256_5); + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); + + flat2 = _mm_max_epu8(work, flat2); + p256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); + q256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); + p6 = _mm256_castsi256_si128(p256_6); + q6 = _mm256_castsi256_si128(q256_6); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); + + flat2 = _mm_max_epu8(work, flat2); + + p256_7 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); + q256_7 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); + p7 = _mm256_castsi256_si128(p256_7); + q7 = _mm256_castsi256_si128(q256_7); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask } - // lp filter + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - __m128i ps1 = _mm_xor_si128(p1, t80); - __m128i 
ps0 = _mm_xor_si128(p0, t80); - __m128i qs0 = _mm_xor_si128(q0, t80); - __m128i qs1 = _mm_xor_si128(q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, - flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, - flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, - flat_q2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - // loopfilter done - - { - __m128i work; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 6 * p))); - q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 5 * p))); - p5 = _mm256_castsi256_si128(p256_5); - q5 = _mm256_castsi256_si128(q256_5); - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), - _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); - - flat2 = _mm_max_epu8(work, flat2); - p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 7 * p))); - q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 6 * p))); - p6 = _mm256_castsi256_si128(p256_6); - q6 = _mm256_castsi256_si128(q256_6); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), - _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); - - flat2 = _mm_max_epu8(work, flat2); - - p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 8 * p))); - q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 7 * p))); - p7 = _mm256_castsi256_si128(p256_7); - q7 = _mm256_castsi256_si128(q256_7); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p7, 
p0), _mm_subs_epu8(p0, p7)), - _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m256i eight = _mm256_set1_epi16(8); - const __m256i four = _mm256_set1_epi16(4); - __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, - pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, - res_q; - - const __m256i filter = _mm256_load_si256( - (__m256i const *)filt_loopfilter_avx2); - p256_7 = _mm256_shuffle_epi8(p256_7, filter); - p256_6 = _mm256_shuffle_epi8(p256_6, filter); - p256_5 = _mm256_shuffle_epi8(p256_5, filter); - p256_4 = _mm256_shuffle_epi8(p256_4, filter); - p256_3 = _mm256_shuffle_epi8(p256_3, filter); - p256_2 = _mm256_shuffle_epi8(p256_2, filter); - p256_1 = _mm256_shuffle_epi8(p256_1, filter); - p256_0 = _mm256_shuffle_epi8(p256_0, filter); - q256_0 = _mm256_shuffle_epi8(q256_0, filter); - q256_1 = _mm256_shuffle_epi8(q256_1, filter); - q256_2 = _mm256_shuffle_epi8(q256_2, filter); - q256_3 = _mm256_shuffle_epi8(q256_3, filter); - q256_4 = _mm256_shuffle_epi8(q256_4, filter); - q256_5 = _mm256_shuffle_epi8(q256_5, filter); - q256_6 = _mm256_shuffle_epi8(q256_6, filter); - q256_7 = _mm256_shuffle_epi8(q256_7, filter); - - pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), - _mm256_add_epi16(p256_4, p256_3)); - pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), - _mm256_add_epi16(q256_4, q256_3)); - - pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, - _mm256_add_epi16(p256_2, p256_1)); - pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, - _mm256_add_epi16(q256_2, q256_1)); - pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - - pixelFilter_p = _mm256_add_epi16(eight, - _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); - - pixetFilter_p2p1p0 = _mm256_add_epi16(four, - _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(p256_7, p256_0)), 4); - - flat2_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(q256_7, q256_0)), 4); - - flat2_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(p256_3, p256_0)), 3); + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, + pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - flat_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + const __m256i filter = + _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); + p256_7 = _mm256_shuffle_epi8(p256_7, filter); + p256_6 = _mm256_shuffle_epi8(p256_6, filter); + p256_5 = _mm256_shuffle_epi8(p256_5, filter); + p256_4 = _mm256_shuffle_epi8(p256_4, filter); + p256_3 = _mm256_shuffle_epi8(p256_3, filter); + p256_2 = _mm256_shuffle_epi8(p256_2, filter); + p256_1 = _mm256_shuffle_epi8(p256_1, filter); + p256_0 = _mm256_shuffle_epi8(p256_0, filter); + q256_0 = 
_mm256_shuffle_epi8(q256_0, filter); + q256_1 = _mm256_shuffle_epi8(q256_1, filter); + q256_2 = _mm256_shuffle_epi8(q256_2, filter); + q256_3 = _mm256_shuffle_epi8(q256_3, filter); + q256_4 = _mm256_shuffle_epi8(q256_4, filter); + q256_5 = _mm256_shuffle_epi8(q256_5, filter); + q256_6 = _mm256_shuffle_epi8(q256_6, filter); + q256_7 = _mm256_shuffle_epi8(q256_7, filter); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(q256_3, q256_0)), 3); + pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), + _mm256_add_epi16(p256_4, p256_3)); + pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), + _mm256_add_epi16(q256_4, q256_3)); - flat_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + pixetFilter_p2p1p0 = + _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - sum_p7 = _mm256_add_epi16(p256_7, p256_7); + pixetFilter_q2q1q0 = + _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - sum_q7 = _mm256_add_epi16(q256_7, q256_7); + pixelFilter_p = _mm256_add_epi16( + eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); - sum_p3 = _mm256_add_epi16(p256_3, p256_3); + pixetFilter_p2p1p0 = _mm256_add_epi16( + four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - sum_q3 = _mm256_add_epi16(q256_3, q256_3); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4); - pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); + flat2_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_1)), 4); + flat2_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - flat2_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(p256_3, p256_0)), + 3); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_1)), 4); + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - flat2_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(q256_3, q256_0)), + 3); - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); + sum_p7 = _mm256_add_epi16(p256_7, p256_7); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_1)), 3); + sum_q7 = _mm256_add_epi16(q256_7, q256_7); - flat_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + sum_p3 = _mm256_add_epi16(p256_3, p256_3); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_1)), 3); + sum_q3 = _mm256_add_epi16(q256_3, q256_3); - flat_q1 = 
_mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4); - sum_p3 = _mm256_add_epi16(sum_p3, p256_3); + flat2_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - sum_q3 = _mm256_add_epi16(sum_q3, q256_3); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); + flat2_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_2)), 4); + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); - flat2_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_1)), + 3); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_2)), 4); + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - flat2_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_1)), + 3); - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_2)), 3); + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - flat_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + sum_p3 = _mm256_add_epi16(sum_p3, p256_3); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_2)), 3); + sum_q3 = _mm256_add_epi16(sum_q3, q256_3); - flat_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); + flat2_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_3)), 4); + flat2_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - flat2_p3 = _mm256_castsi256_si128( - 
_mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_3)), 4); + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); - flat2_q3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_2)), + 3); - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + flat_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_2)), + 3); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); + flat_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_4)), 4); + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - flat2_p4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_4)), 4); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); - flat2_q4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4); - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + flat2_p3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); + flat2_q3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_5)), 4); + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - flat2_p5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_5)), 4); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); - flat2_q5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4); - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + flat2_p4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4); - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); + flat2_q4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - pixelFilter_q = 
_mm256_sub_epi16(pixelFilter_q, p256_1); + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_6)), 4); + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - flat2_p6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_6)), 4); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); - flat2_q6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - } + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4); - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2_p5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - p2 = _mm_andnot_si128(flat, p2); - flat_p2 = _mm_and_si128(flat, flat_p2); - p2 = _mm_or_si128(flat_p2, p2); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4); - p1 = _mm_andnot_si128(flat, ps1); - flat_p1 = _mm_and_si128(flat, flat_p1); - p1 = _mm_or_si128(flat_p1, p1); + flat2_q5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - p0 = _mm_andnot_si128(flat, ps0); - flat_p0 = _mm_and_si128(flat, flat_p0); - p0 = _mm_or_si128(flat_p0, p0); + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - q0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(flat_q0, q0); + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - q1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(flat_q1, q1); + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(flat_q2, q2); + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); - p6 = _mm_andnot_si128(flat2, p6); - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *) (s - 7 * p), p6); + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4); - p5 = _mm_andnot_si128(flat2, p5); - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *) (s - 6 * p), p5); + flat2_p6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - p4 = _mm_andnot_si128(flat2, p4); - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *) (s - 5 * p), p4); + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4); - p3 = _mm_andnot_si128(flat2, p3); - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *) (s - 4 * p), p3); - - p2 = _mm_andnot_si128(flat2, p2); - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *) (s - 3 * p), p2); - - p1 = _mm_andnot_si128(flat2, p1); - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *) (s - 2 * p), p1); - - p0 = _mm_andnot_si128(flat2, p0); - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *) (s - 1 * p), p0); - - q0 = _mm_andnot_si128(flat2, q0); - 
flat2_q0 = _mm_and_si128(flat2, flat2_q0); - q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *) (s - 0 * p), q0); - - q1 = _mm_andnot_si128(flat2, q1); - flat2_q1 = _mm_and_si128(flat2, flat2_q1); - q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *) (s + 1 * p), q1); - - q2 = _mm_andnot_si128(flat2, q2); - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *) (s + 2 * p), q2); - - q3 = _mm_andnot_si128(flat2, q3); - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *) (s + 3 * p), q3); - - q4 = _mm_andnot_si128(flat2, q4); - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *) (s + 4 * p), q4); - - q5 = _mm_andnot_si128(flat2, q5); - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *) (s + 5 * p), q5); - - q6 = _mm_andnot_si128(flat2, q6); - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *) (s + 6 * p), q6); + flat2_q6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + p2 = _mm_andnot_si128(flat, p2); + flat_p2 = _mm_and_si128(flat, flat_p2); + p2 = _mm_or_si128(flat_p2, p2); + + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(flat_q2, q2); + + p6 = _mm_andnot_si128(flat2, p6); + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + p6 = _mm_or_si128(flat2_p6, p6); + _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + + p5 = _mm_andnot_si128(flat2, p5); + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + p5 = _mm_or_si128(flat2_p5, p5); + _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + + p4 = _mm_andnot_si128(flat2, p4); + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + p4 = _mm_or_si128(flat2_p4, p4); + _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + + p3 = _mm_andnot_si128(flat2, p3); + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + p3 = _mm_or_si128(flat2_p3, p3); + _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + + p2 = _mm_andnot_si128(flat2, p2); + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + p2 = _mm_or_si128(flat2_p2, p2); + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + + p1 = _mm_andnot_si128(flat2, p1); + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + p1 = _mm_or_si128(flat2_p1, p1); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + + p0 = _mm_andnot_si128(flat2, p0); + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + p0 = _mm_or_si128(flat2_p0, p0); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + + q0 = _mm_andnot_si128(flat2, q0); + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + q0 = _mm_or_si128(flat2_q0, q0); + _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + + q1 = _mm_andnot_si128(flat2, q1); + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + q1 = _mm_or_si128(flat2_q1, q1); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + + q2 = _mm_andnot_si128(flat2, q2); + flat2_q2 = 
_mm_and_si128(flat2, flat2_q2); + q2 = _mm_or_si128(flat2_q2, q2); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + + q3 = _mm_andnot_si128(flat2, q3); + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + q3 = _mm_or_si128(flat2_q3, q3); + _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + + q4 = _mm_andnot_si128(flat2, q4); + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + q4 = _mm_or_si128(flat2_q4, q4); + _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + + q5 = _mm_andnot_si128(flat2, q5); + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + q5 = _mm_or_si128(flat2_q5, q5); + _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + + q6 = _mm_andnot_si128(flat2, q6); + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + q6 = _mm_or_si128(flat2_q6, q6); + _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + } } diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index 739adf31d067912d2c8c047f744800e7927ac70b..e13334ae0ea87c1d0f4f69950cf758674cf416f4 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -19,84 +19,89 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { } // filter_mask and hev_mask -#define FILTER_HEV_MASK do { \ - /* (abs(q1 - q0), abs(p1 - p0) */ \ - __m128i flat = abs_diff(q1p1, q0p0); \ - /* abs(p1 - q1), abs(p0 - q0) */ \ - const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ - __m128i abs_p0q0, abs_p1q1, work; \ - \ - /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ - hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ - hev = _mm_packs_epi16(hev, hev); \ - \ - /* const int8_t mask = filter_mask(*limit, *blimit, */ \ - /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ - abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\ - abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\ - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ - abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ - mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ - /* abs(p3 - p2), abs(p2 - p1) */ \ - work = abs_diff(p3p2, p2p1); \ - flat = _mm_max_epu8(work, flat); \ - /* abs(q3 - q2), abs(q2 - q1) */ \ - work = abs_diff(q3q2, q2q1); \ - flat = _mm_max_epu8(work, flat); \ - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ - mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ - mask = _mm_cmpeq_epi8(mask, zero); \ - mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ -} while (0) - -#define FILTER4 do { \ - const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, \ - 4, 4, 4, 4, 4, 4, 4, 4); \ - const __m128i t80 = _mm_set1_epi8(0x80); \ - __m128i filter, filter2filter1, work; \ - \ - ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ - qs1qs0 = _mm_xor_si128(q1q0, t80); \ - \ - /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ - work = _mm_subs_epi8(ps1ps0, qs1qs0); \ - filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ - /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ - filter = _mm_and_si128(filter, mask); /* & mask */ \ - filter = _mm_unpacklo_epi64(filter, filter); \ - \ - /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ - /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ - filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ - filter = 
_mm_unpackhi_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ - filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ - filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ - \ - /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ - filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ - filter = _mm_unpacklo_epi8(filter, filter); \ - filter = _mm_srai_epi16(filter, 9); /* round */ \ - filter = _mm_packs_epi16(filter, filter); \ - filter = _mm_andnot_si128(hev, filter); \ - \ - hev = _mm_unpackhi_epi64(filter2filter1, filter); \ - filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ - \ - /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ - qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ - /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ - ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ - qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ - ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ -} while (0) +#define FILTER_HEV_MASK \ + do { \ + /* (abs(q1 - q0), abs(p1 - p0) */ \ + __m128i flat = abs_diff(q1p1, q0p0); \ + /* abs(p1 - q1), abs(p0 - q0) */ \ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ + __m128i abs_p0q0, abs_p1q1, work; \ + \ + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ + hev = \ + _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ + hev = _mm_cmpgt_epi16(hev, thresh); \ + hev = _mm_packs_epi16(hev, hev); \ + \ + /* const int8_t mask = filter_mask(*limit, *blimit, */ \ + /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ + abs_p0q0 = \ + _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \ + abs_p1q1 = \ + _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ + /* abs(p3 - p2), abs(p2 - p1) */ \ + work = abs_diff(p3p2, p2p1); \ + flat = _mm_max_epu8(work, flat); \ + /* abs(q3 - q2), abs(q2 - q1) */ \ + work = abs_diff(q3q2, q2q1); \ + flat = _mm_max_epu8(work, flat); \ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ + mask = _mm_unpacklo_epi64(mask, flat); \ + mask = _mm_subs_epu8(mask, limit); \ + mask = _mm_cmpeq_epi8(mask, zero); \ + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ + } while (0) + +#define FILTER4 \ + do { \ + const __m128i t3t4 = \ + _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \ + const __m128i t80 = _mm_set1_epi8(0x80); \ + __m128i filter, filter2filter1, work; \ + \ + ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ + qs1qs0 = _mm_xor_si128(q1q0, t80); \ + \ + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ + work = _mm_subs_epi8(ps1ps0, qs1qs0); \ + filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ + filter = _mm_and_si128(filter, mask); /* & mask */ \ + filter = _mm_unpacklo_epi64(filter, filter); \ + \ + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ + filter = 
_mm_unpackhi_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ + \ + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ + filter = _mm_unpacklo_epi8(filter, filter); \ + filter = _mm_srai_epi16(filter, 9); /* round */ \ + filter = _mm_packs_epi16(filter, filter); \ + filter = _mm_andnot_si128(hev, filter); \ + \ + hev = _mm_unpackhi_epi64(filter2filter1, filter); \ + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ + \ + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ + qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ + ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ + qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ + ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ + } while (0) void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, const uint8_t *_blimit, const uint8_t *_limit, @@ -128,8 +133,8 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, FILTER4; _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 } @@ -238,27 +243,27 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, __m128i abs_p1p0; q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), - (__m64 *)(s + 4 * p))); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3), - (__m64 *)(s + 3 * p))); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2), - (__m64 *)(s + 2 * p))); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), - (__m64 *)(s + 1 * p))); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); p1q1 = _mm_shuffle_epi32(q1p1, 78); q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), - (__m64 *)(s - 0 * p))); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); fe = _mm_set1_epi8(0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = abs_diff(q0p0, p0q0); @@ -267,7 +272,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = 
_mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); @@ -276,8 +281,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(abs_diff(q2p2, q1p1), - abs_diff(q3p3, q2p2)); + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, limit); @@ -339,17 +343,17 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, flat = _mm_and_si128(flat, mask); q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), - (__m64 *)(s + 5 * p))); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), - (__m64 *)(s + 6 * p))); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), - (__m64 *)(s + 7 * p))); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); @@ -369,7 +373,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - p7_16 = _mm_unpacklo_epi8(q7p7, zero);; + p7_16 = _mm_unpacklo_epi8(q7p7, zero); p6_16 = _mm_unpacklo_epi8(q6p6, zero); p5_16 = _mm_unpacklo_epi8(q5p5, zero); p4_16 = _mm_unpacklo_epi8(q4p4, zero); @@ -392,24 +396,23 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, _mm_add_epi16(q4_16, q3_16)); pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, - pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, - pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(p7_16, p0_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(q7_16, q0_16)), 4); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3_16, q0_16)), 3); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, 
_mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); flat_q0p0 = _mm_packus_epi16(res_p, res_q); @@ -420,18 +423,18 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p1_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q1_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); flat2_q1p1 = _mm_packus_epi16(res_p, res_q); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1_16)), 3); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); flat_q1p1 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); @@ -441,59 +444,59 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p2_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q2_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); flat2_q2p2 = _mm_packus_epi16(res_p, res_q); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2_16)), 3); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); flat_q2p2 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p3_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q3_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); flat2_q3p3 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p4_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q4_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, 
_mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); flat2_q4p4 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p5_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q5_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); flat2_q5p5 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p6_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q6_16)), 4); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); flat2_q6p6 = _mm_packus_epi16(res_p, res_q); } // wide flat @@ -554,7 +557,7 @@ void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); } } @@ -572,8 +575,8 @@ static INLINE __m128i filter8_mask(const __m128i *const flat, const __m128i *const other_filt, const __m128i *const f8_lo, const __m128i *const f8_hi) { - const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), - _mm_srli_epi16(*f8_hi, 3)); + const __m128i f8 = + _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3)); const __m128i result = _mm_and_si128(*flat, f8); return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } @@ -582,8 +585,8 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, const __m128i *const other_filt, const __m128i *const f_lo, const __m128i *const f_hi) { - const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), - _mm_srli_epi16(*f_hi, 4)); + const __m128i f = + _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4)); const __m128i result = _mm_and_si128(*flat, f); return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } @@ -633,7 +636,7 @@ void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, __m128i work; max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); @@ -832,16 +835,16 @@ void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, __m128i f_hi; f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 - f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), - _mm_add_epi16(p4_lo, f_lo)); + f_lo = + _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo)); f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), _mm_add_epi16(p2_lo, p1_lo)); f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), 
f_lo); f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 - f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), - _mm_add_epi16(p4_hi, f_hi)); + f_hi = + _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi)); f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), _mm_add_epi16(p2_hi, p1_hi)); f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); @@ -956,7 +959,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); abs_p0q0 = abs_diff(q0p0, p0q0); abs_p1q1 = abs_diff(q1p1, p1q1); @@ -964,7 +967,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); @@ -973,8 +976,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(abs_diff(q2p2, q1p1), - abs_diff(q3p3, q2p2)); + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, limit); @@ -982,8 +984,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, // flat_mask4 - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), - abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); flat = _mm_max_epu8(abs_p1p0, flat); flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); @@ -1048,14 +1049,14 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8(0x80); const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), - t80); + const __m128i ps1 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1134,8 +1135,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, } } -void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, - const uint8_t *_blimit0, +void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, @@ -1170,17 +1170,17 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), 
- _mm_subs_epu8(q0, q1)); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; // filter_mask and hev_mask @@ -1188,7 +1188,7 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); @@ -1196,29 +1196,25 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, mask = _mm_max_epu8(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); // flat_mask4 - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); flat = _mm_max_epu8(work, flat); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); @@ -1289,14 +1285,14 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); + const __m128i ps1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * 
p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1412,23 +1408,23 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, // filter_mask and hev_mask { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); @@ -1436,15 +1432,13 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, mask = _mm_max_epu8(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); @@ -1460,14 +1454,14 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); + const __m128i ps1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1525,44 +1519,44 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, __m128i x8, x9, x10, x11, x12, x13, x14, x15; // 2-way interleave w/hoisting of unpacks - x0 = _mm_loadl_epi64((__m128i *)in0); // 1 + x0 = 
_mm_loadl_epi64((__m128i *)in0); // 1 x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3 - x0 = _mm_unpacklo_epi8(x0, x1); // 1 + x0 = _mm_unpacklo_epi8(x0, x1); // 1 x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5 - x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); // 7 - x1 = _mm_unpacklo_epi8(x2, x3); // 2 + x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7 + x1 = _mm_unpacklo_epi8(x2, x3); // 2 - x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); // 9 - x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); // 11 - x2 = _mm_unpacklo_epi8(x4, x5); // 3 + x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9 + x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11 + x2 = _mm_unpacklo_epi8(x4, x5); // 3 - x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); // 13 - x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); // 15 - x3 = _mm_unpacklo_epi8(x6, x7); // 4 - x4 = _mm_unpacklo_epi16(x0, x1); // 9 + x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13 + x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15 + x3 = _mm_unpacklo_epi8(x6, x7); // 4 + x4 = _mm_unpacklo_epi16(x0, x1); // 9 - x8 = _mm_loadl_epi64((__m128i *)in1); // 2 + x8 = _mm_loadl_epi64((__m128i *)in1); // 2 x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4 - x8 = _mm_unpacklo_epi8(x8, x9); // 5 - x5 = _mm_unpacklo_epi16(x2, x3); // 10 + x8 = _mm_unpacklo_epi8(x8, x9); // 5 + x5 = _mm_unpacklo_epi16(x2, x3); // 10 x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6 - x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); // 8 - x9 = _mm_unpacklo_epi8(x10, x11); // 6 + x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8 + x9 = _mm_unpacklo_epi8(x10, x11); // 6 - x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); // 10 - x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); // 12 - x10 = _mm_unpacklo_epi8(x12, x13); // 7 - x12 = _mm_unpacklo_epi16(x8, x9); // 11 + x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10 + x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12 + x10 = _mm_unpacklo_epi8(x12, x13); // 7 + x12 = _mm_unpacklo_epi16(x8, x9); // 11 - x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); // 14 - x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p)); // 16 - x11 = _mm_unpacklo_epi8(x14, x15); // 8 - x13 = _mm_unpacklo_epi16(x10, x11); // 12 + x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14 + x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16 + x11 = _mm_unpacklo_epi8(x14, x15); // 8 + x13 = _mm_unpacklo_epi16(x10, x11); // 12 - x6 = _mm_unpacklo_epi32(x4, x5); // 13 - x7 = _mm_unpackhi_epi32(x4, x5); // 14 + x6 = _mm_unpacklo_epi32(x4, x5); // 13 + x7 = _mm_unpackhi_epi32(x4, x5); // 14 x14 = _mm_unpacklo_epi32(x12, x13); // 15 x15 = _mm_unpackhi_epi32(x12, x13); // 16 @@ -1598,23 +1592,31 @@ static INLINE void transpose(unsigned char *src[], int in_p, unsigned char *in = src[idx8x8]; unsigned char *out = dst[idx8x8]; - x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 - x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17 + x0 = + _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + x1 = + _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 x0 = _mm_unpacklo_epi8(x0, x1); - x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 - x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 + x2 = + _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + x3 = + _mm_loadl_epi64((__m128i 
*)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 x1 = _mm_unpacklo_epi8(x2, x3); - x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 - x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 + x4 = + _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + x5 = + _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 x2 = _mm_unpacklo_epi8(x4, x5); - x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 - x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 + x6 = + _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + x7 = + _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 x3 = _mm_unpacklo_epi8(x6, x7); @@ -1624,15 +1626,15 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpacklo_epi16(x2, x3); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 0*out_p), + _mm_storel_pd((double *)(out + 0 * out_p), _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1*out_p), + _mm_storeh_pd((double *)(out + 1 * out_p), _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 2*out_p), + _mm_storel_pd((double *)(out + 2 * out_p), _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3*out_p), + _mm_storeh_pd((double *)(out + 3 * out_p), _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 @@ -1641,25 +1643,23 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpackhi_epi16(x2, x3); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 4*out_p), + _mm_storel_pd((double *)(out + 4 * out_p), _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5*out_p), + _mm_storeh_pd((double *)(out + 5 * out_p), _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 6*out_p), + _mm_storel_pd((double *)(out + 6 * out_p), _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7*out_p), + _mm_storeh_pd((double *)(out + 7 * out_p), _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 } while (++idx8x8 < num_8x8_to_transpose); } void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); unsigned char *src[2]; @@ -1705,10 +1705,8 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, } void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); unsigned char *src[2]; diff --git 
a/vpx_dsp/x86/masked_sad_intrin_ssse3.c b/vpx_dsp/x86/masked_sad_intrin_ssse3.c index 8b9ff1099d30c77bedf53d0ff982259ad1ddb08f..e07ff5f4df2dce826264b490f18ecef1a06dd222 100644 --- a/vpx_dsp/x86/masked_sad_intrin_ssse3.c +++ b/vpx_dsp/x86/masked_sad_intrin_ssse3.c @@ -17,17 +17,17 @@ #include "vpx/vpx_integer.h" static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) { - __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr); - __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride)); + __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); + __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); return _mm_unpacklo_epi64(temp1, temp2); } static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) { - __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t*)ptr); - __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride)); + __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr); + __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride)); __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2); - temp1 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 2)); - temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 3)); + temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2)); + temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3)); temp1 = _mm_unpacklo_epi32(temp1, temp2); return _mm_unpacklo_epi64(temp3, temp1); } @@ -37,32 +37,21 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, const uint8_t *m_ptr, int m_stride, int width, int height); -static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr, - int a_stride, - const uint8_t *b_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height); - -static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr, - int a_stride, - const uint8_t *b_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height); - -#define MASKSADMXN_SSSE3(m, n) \ -unsigned int vpx_masked_sad##m##x##n##_ssse3(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - const uint8_t *msk, \ - int msk_stride) { \ - return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \ - m, n); \ -} +static INLINE unsigned int masked_sad8xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +static INLINE unsigned int masked_sad4xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +#define MASKSADMXN_SSSE3(m, n) \ + unsigned int vpx_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \ + m, n); \ + } #if CONFIG_EXT_PARTITION MASKSADMXN_SSSE3(128, 128) @@ -78,28 +67,25 @@ MASKSADMXN_SSSE3(16, 32) MASKSADMXN_SSSE3(16, 16) MASKSADMXN_SSSE3(16, 8) -#define MASKSAD8XN_SSSE3(n) \ -unsigned int vpx_masked_sad8x##n##_ssse3(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - const uint8_t *msk, \ - int msk_stride) { \ - return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, n); \ -} +#define MASKSAD8XN_SSSE3(n) \ + unsigned int vpx_masked_sad8x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return 
masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } MASKSAD8XN_SSSE3(16) MASKSAD8XN_SSSE3(8) MASKSAD8XN_SSSE3(4) -#define MASKSAD4XN_SSSE3(n) \ -unsigned int vpx_masked_sad4x##n##_ssse3(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *msk, int msk_stride) { \ - return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, n); \ -} +#define MASKSAD4XN_SSSE3(n) \ + unsigned int vpx_masked_sad4x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } MASKSAD4XN_SSSE3(8) MASKSAD4XN_SSSE3(4) @@ -119,9 +105,9 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, // Covering the full width for (x = 0; x < width; x += 16) { // Load a, b, m in xmm registers - a = _mm_loadu_si128((const __m128i*)(a_ptr + x)); - b = _mm_loadu_si128((const __m128i*)(b_ptr + x)); - m = _mm_loadu_si128((const __m128i*)(m_ptr + x)); + a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); + b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); + m = _mm_loadu_si128((const __m128i *)(m_ptr + x)); // Calculate the difference between a & b temp1 = _mm_subs_epu8(a, b); @@ -144,13 +130,9 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, return (_mm_cvtsi128_si32(res) + 31) >> 6; } -static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr, - int a_stride, - const uint8_t *b_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height) { +static INLINE unsigned int masked_sad8xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { int y; __m128i a, b, m, temp1, temp2, row_res; __m128i res = _mm_setzero_si128(); @@ -184,13 +166,9 @@ static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr, return (_mm_cvtsi128_si32(res) + 31) >> 6; } -static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr, - int a_stride, - const uint8_t *b_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height) { +static INLINE unsigned int masked_sad4xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { int y; __m128i a, b, m, temp1, temp2, row_res; __m128i res = _mm_setzero_si128(); @@ -228,37 +206,26 @@ static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr, #if CONFIG_VP9_HIGHBITDEPTH static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr, int stride) { - __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr); - __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride)); + __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); + __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); return _mm_unpacklo_epi64(temp1, temp2); } -static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr, - int a_stride, - const uint8_t *b8_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int width, int height); - -static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr, - int a_stride, - const uint8_t *b8_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height); - -#define HIGHBD_MASKSADMXN_SSSE3(m, n) \ -unsigned int vpx_highbd_masked_sad##m##x##n##_ssse3(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int 
ref_stride, \ - const uint8_t *msk, \ - int msk_stride) { \ - return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, m, n); \ -} +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, int height); + +static INLINE unsigned int highbd_masked_sad4xh_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +#define HIGHBD_MASKSADMXN_SSSE3(m, n) \ + unsigned int vpx_highbd_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, m, n); \ + } #if CONFIG_EXT_PARTITION HIGHBD_MASKSADMXN_SSSE3(128, 128) @@ -277,29 +244,22 @@ HIGHBD_MASKSADMXN_SSSE3(8, 16) HIGHBD_MASKSADMXN_SSSE3(8, 8) HIGHBD_MASKSADMXN_SSSE3(8, 4) -#define HIGHBD_MASKSAD4XN_SSSE3(n) \ -unsigned int vpx_highbd_masked_sad4x##n##_ssse3(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - const uint8_t *msk, \ - int msk_stride) { \ - return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, n); \ -} +#define HIGHBD_MASKSAD4XN_SSSE3(n) \ + unsigned int vpx_highbd_masked_sad4x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } HIGHBD_MASKSAD4XN_SSSE3(8) HIGHBD_MASKSAD4XN_SSSE3(4) // For width a multiple of 8 // Assumes values in m are <=64 -static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr, - int a_stride, - const uint8_t *b8_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int width, int height) { +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, int height) { int y, x; __m128i a, b, m, temp1, temp2; const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); @@ -310,9 +270,9 @@ static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr, // Covering the full width for (x = 0; x < width; x += 8) { // Load a, b, m in xmm registers - a = _mm_loadu_si128((const __m128i*)(a_ptr + x)); - b = _mm_loadu_si128((const __m128i*)(b_ptr + x)); - m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(m_ptr + x)), + a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); + b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); + m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)), _mm_setzero_si128()); // Calculate the difference between a & b @@ -334,13 +294,9 @@ static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr, return (_mm_cvtsi128_si32(res) + 31) >> 6; } -static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr, - int a_stride, - const uint8_t *b8_ptr, - int b_stride, - const uint8_t *m_ptr, - int m_stride, - int height) { +static INLINE unsigned int highbd_masked_sad4xh_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { int y; __m128i a, b, m, temp1, temp2; const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); @@ -351,8 +307,8 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t 
*a8_ptr, // Load a, b, m in xmm registers a = highbd_width4_load_2rows(a_ptr, a_stride); b = highbd_width4_load_2rows(b_ptr, b_stride); - temp1 = _mm_loadl_epi64((const __m128i*)m_ptr); - temp2 = _mm_loadl_epi64((const __m128i*)(m_ptr + m_stride)); + temp1 = _mm_loadl_epi64((const __m128i *)m_ptr); + temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)); m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2), _mm_setzero_si128()); diff --git a/vpx_dsp/x86/masked_variance_intrin_ssse3.c b/vpx_dsp/x86/masked_variance_intrin_ssse3.c index a0c2b6e853ab40589068befdd347815fe022adeb..ae08422746dd0e998bac56f8913b2268ec7b3d57 100644 --- a/vpx_dsp/x86/masked_variance_intrin_ssse3.c +++ b/vpx_dsp/x86/masked_variance_intrin_ssse3.c @@ -18,9 +18,8 @@ #include "vpx_ports/mem.h" #include "vpx_dsp/vpx_filter.h" - // Half pixel shift -#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS/2) +#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS / 2) /***************************************************************************** * Horizontal additions @@ -39,7 +38,7 @@ static INLINE int64_t hsum_epi64_si64(__m128i v_q) { #else { int64_t tmp; - _mm_storel_epi64((__m128i*)&tmp, v_q); + _mm_storel_epi64((__m128i *)&tmp, v_q); return tmp; } #endif @@ -47,7 +46,7 @@ static INLINE int64_t hsum_epi64_si64(__m128i v_q) { #if CONFIG_VP9_HIGHBITDEPTH static INLINE int64_t hsum_epi32_si64(__m128i v_d) { - const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); + const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); @@ -55,8 +54,8 @@ static INLINE int64_t hsum_epi32_si64(__m128i v_d) { #endif // CONFIG_VP9_HIGHBITDEPTH static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q, - uint32_t* sse, - const int w, const int h) { + uint32_t *sse, const int w, + const int h) { int64_t sum64; uint64_t sse64; @@ -73,7 +72,7 @@ static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q, // Store the SSE *sse = (uint32_t)sse64; // Compute the variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); } /***************************************************************************** @@ -81,11 +80,8 @@ static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q, *****************************************************************************/ static INLINE unsigned int masked_variancewxh_ssse3( - const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - unsigned int *sse) { + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { int ii, jj; const __m128i v_zero = _mm_setzero_si128(); @@ -96,11 +92,11 @@ static INLINE unsigned int masked_variancewxh_ssse3( assert((w % 16) == 0); for (ii = 0; ii < h; ii++) { - for (jj = 0 ; jj < w ; jj += 16) { + for (jj = 0; jj < w; jj += 16) { // Load inputs - 8 bits - const __m128i v_a_b = _mm_loadu_si128((const __m128i*)(a+jj)); - const __m128i v_b_b = _mm_loadu_si128((const __m128i*)(b+jj)); - const __m128i v_m_b = _mm_loadu_si128((const __m128i*)(m+jj)); + const __m128i v_a_b = _mm_loadu_si128((const __m128i *)(a + jj)); + const __m128i v_b_b = _mm_loadu_si128((const __m128i *)(b + jj)); + const __m128i v_m_b = _mm_loadu_si128((const __m128i *)(m + jj)); // Unpack to 16 bits 
- still containing max 8 bits const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero); @@ -147,17 +143,13 @@ static INLINE unsigned int masked_variancewxh_ssse3( return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); } -#define MASKED_VARWXH(W, H) \ -unsigned int vpx_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - return masked_variancewxh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - W, H, sse); \ -} +#define MASKED_VARWXH(W, H) \ + unsigned int vpx_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, m_stride, W, \ + H, sse); \ + } MASKED_VARWXH(16, 8) MASKED_VARWXH(16, 16) @@ -178,11 +170,8 @@ MASKED_VARWXH(128, 128) *****************************************************************************/ static INLINE unsigned int masked_variance8xh_ssse3( - const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, - int h, - unsigned int *sse) { + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, unsigned int *sse) { int ii; const __m128i v_zero = _mm_setzero_si128(); @@ -192,9 +181,9 @@ static INLINE unsigned int masked_variance8xh_ssse3( for (ii = 0; ii < h; ii++) { // Load inputs - 8 bits - const __m128i v_a_b = _mm_loadl_epi64((const __m128i*)a); - const __m128i v_b_b = _mm_loadl_epi64((const __m128i*)b); - const __m128i v_m_b = _mm_loadl_epi64((const __m128i*)m); + const __m128i v_a_b = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b_b = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)m); // Unpack to 16 bits - still containing max 8 bits const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero); @@ -229,17 +218,13 @@ static INLINE unsigned int masked_variance8xh_ssse3( return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); } -#define MASKED_VAR8XH(H) \ -unsigned int vpx_masked_variance8x##H##_ssse3( \ - const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - return masked_variance8xh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - H, sse); \ -} +#define MASKED_VAR8XH(H) \ + unsigned int vpx_masked_variance8x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variance8xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \ + sse); \ + } MASKED_VAR8XH(4) MASKED_VAR8XH(8) @@ -250,11 +235,8 @@ MASKED_VAR8XH(16) *****************************************************************************/ static INLINE unsigned int masked_variance4xh_ssse3( - const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, - int h, - unsigned int *sse) { + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, unsigned int *sse) { int ii; const __m128i v_zero = _mm_setzero_si128(); @@ -264,14 +246,14 @@ static INLINE unsigned int masked_variance4xh_ssse3( assert((h % 2) == 0); - for (ii = 0; ii < h/2; ii++) { + for (ii = 0; ii < h / 2; ii++) { // Load 2 input rows - 8 bits - const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t*)a); - const __m128i v_b0_b = _mm_cvtsi32_si128(*(const 
uint32_t*)b); - const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t*)m); - const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t*)(a + a_stride)); - const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t*)(b + b_stride)); - const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t*)(m + m_stride)); + const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t *)a); + const __m128i v_b0_b = _mm_cvtsi32_si128(*(const uint32_t *)b); + const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m); + const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t *)(a + a_stride)); + const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t *)(b + b_stride)); + const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride)); // Interleave 2 rows into a single register const __m128i v_a_b = _mm_unpacklo_epi32(v_a0_b, v_a1_b); @@ -311,17 +293,13 @@ static INLINE unsigned int masked_variance4xh_ssse3( return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); } -#define MASKED_VAR4XH(H) \ -unsigned int vpx_masked_variance4x##H##_ssse3( \ - const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - return masked_variance4xh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - H, sse); \ -} +#define MASKED_VAR4XH(H) \ + unsigned int vpx_masked_variance4x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variance4xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \ + sse); \ + } MASKED_VAR4XH(4) MASKED_VAR4XH(8) @@ -330,11 +308,8 @@ MASKED_VAR4XH(8) // Main calculation for n*8 wide blocks static INLINE void highbd_masked_variance64_ssse3( - const uint16_t *a, int a_stride, - const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - int64_t *sum, uint64_t *sse) { + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, int64_t *sum, uint64_t *sse) { int ii, jj; const __m128i v_zero = _mm_setzero_si128(); @@ -345,11 +320,11 @@ static INLINE void highbd_masked_variance64_ssse3( assert((w % 8) == 0); for (ii = 0; ii < h; ii++) { - for (jj = 0 ; jj < w ; jj += 8) { + for (jj = 0; jj < w; jj += 8) { // Load inputs - 8 bits - const __m128i v_a_w = _mm_loadu_si128((const __m128i*)(a+jj)); - const __m128i v_b_w = _mm_loadu_si128((const __m128i*)(b+jj)); - const __m128i v_m_b = _mm_loadl_epi64((const __m128i*)(m+jj)); + const __m128i v_a_w = _mm_loadu_si128((const __m128i *)(a + jj)); + const __m128i v_b_w = _mm_loadu_si128((const __m128i *)(b + jj)); + const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)(m + jj)); // Unpack m to 16 bits - still containing max 8 bits const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); @@ -396,18 +371,15 @@ static INLINE void highbd_masked_variance64_ssse3( *sse = hsum_epi64_si64(v_sse_q); // Round - *sum = (*sum >= 0) ? *sum : -*sum; + *sum = (*sum >= 0) ? 
*sum : -*sum; *sum = ROUND_POWER_OF_TWO(*sum, 6); *sse = ROUND_POWER_OF_TWO(*sse, 12); } // Main calculation for 4 wide blocks static INLINE void highbd_masked_variance64_4wide_ssse3( - const uint16_t *a, int a_stride, - const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, - int h, - int64_t *sum, uint64_t *sse) { + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, int64_t *sum, uint64_t *sse) { int ii; const __m128i v_zero = _mm_setzero_si128(); @@ -417,14 +389,14 @@ static INLINE void highbd_masked_variance64_4wide_ssse3( assert((h % 2) == 0); - for (ii = 0; ii < h/2; ii++) { + for (ii = 0; ii < h / 2; ii++) { // Load 2 input rows - 8 bits - const __m128i v_a0_w = _mm_loadl_epi64((const __m128i*)a); - const __m128i v_b0_w = _mm_loadl_epi64((const __m128i*)b); - const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t*)m); - const __m128i v_a1_w = _mm_loadl_epi64((const __m128i*)(a + a_stride)); - const __m128i v_b1_w = _mm_loadl_epi64((const __m128i*)(b + b_stride)); - const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t*)(m + m_stride)); + const __m128i v_a0_w = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b0_w = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m); + const __m128i v_a1_w = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b1_w = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride)); // Interleave 2 rows into a single register const __m128i v_a_w = _mm_unpacklo_epi64(v_a0_w, v_a1_w); @@ -475,26 +447,23 @@ static INLINE void highbd_masked_variance64_4wide_ssse3( *sse = hsum_epi64_si64(v_sse_q); // Round - *sum = (*sum >= 0) ? *sum : -*sum; + *sum = (*sum >= 0) ? 
*sum : -*sum; *sum = ROUND_POWER_OF_TWO(*sum, 6); *sse = ROUND_POWER_OF_TWO(*sse, 12); } static INLINE unsigned int highbd_masked_variancewxh_ssse3( - const uint16_t *a, int a_stride, - const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - unsigned int *sse) { + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { uint64_t sse64; int64_t sum64; if (w == 4) - highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, - h, &sum64, &sse64); + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); else - highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, - &sum64, &sse64); + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); // Store the SSE *sse = (uint32_t)sse64; @@ -503,20 +472,17 @@ static INLINE unsigned int highbd_masked_variancewxh_ssse3( } static INLINE unsigned int highbd_10_masked_variancewxh_ssse3( - const uint16_t *a, int a_stride, - const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - unsigned int *sse) { + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { uint64_t sse64; int64_t sum64; if (w == 4) - highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, - h, &sum64, &sse64); + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); else - highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, - &sum64, &sse64); + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); // Normalise sum64 = ROUND_POWER_OF_TWO(sum64, 2); @@ -529,20 +495,17 @@ static INLINE unsigned int highbd_10_masked_variancewxh_ssse3( } static INLINE unsigned int highbd_12_masked_variancewxh_ssse3( - const uint16_t *a, int a_stride, - const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, - int w, int h, - unsigned int *sse) { + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { uint64_t sse64; int64_t sum64; if (w == 4) - highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, - h, &sum64, &sse64); + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); else - highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, - &sum64, &sse64); + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); sum64 = ROUND_POWER_OF_TWO(sum64, 4); sse64 = ROUND_POWER_OF_TWO(sse64, 8); @@ -553,45 +516,33 @@ static INLINE unsigned int highbd_12_masked_variancewxh_ssse3( return *sse - (uint32_t)((sum64 * sum64) / (w * h)); } -#define HIGHBD_MASKED_VARWXH(W, H) \ -unsigned int vpx_highbd_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a8, int a_stride, \ - const uint8_t *b8, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ - uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ - return highbd_masked_variancewxh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - W, H, sse); \ -} \ - \ -unsigned int vpx_highbd_10_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a8, int a_stride, \ - const uint8_t *b8, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ 
- uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ - uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ - return highbd_10_masked_variancewxh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - W, H, sse); \ -} \ - \ -unsigned int vpx_highbd_12_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a8, int a_stride, \ - const uint8_t *b8, int b_stride, \ - const uint8_t *m, int m_stride, \ - unsigned int *sse) { \ - uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ - uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ - return highbd_12_masked_variancewxh_ssse3(a, a_stride, \ - b, b_stride, \ - m, m_stride, \ - W, H, sse); \ -} +#define HIGHBD_MASKED_VARWXH(W, H) \ + unsigned int vpx_highbd_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } \ + \ + unsigned int vpx_highbd_10_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_10_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } \ + \ + unsigned int vpx_highbd_12_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_12_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } HIGHBD_MASKED_VARWXH(4, 4) HIGHBD_MASKED_VARWXH(4, 8) @@ -619,11 +570,11 @@ HIGHBD_MASKED_VARWXH(128, 128) ////////////////////////////////////////////////////////////////////////////// typedef __m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b, - __m128i v_filter_b); + __m128i v_filter_b); static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b, const __m128i v_filter_b) { - (void) v_filter_b; + (void)v_filter_b; return _mm_avg_epu8(v_a_b, v_b_b); } @@ -634,28 +585,27 @@ static INLINE __m128i apply_filter(const __m128i v_a_b, const __m128i v_b_b, __m128i v_input_hi_b = _mm_unpackhi_epi8(v_a_b, v_b_b); __m128i v_temp0_w = _mm_maddubs_epi16(v_input_lo_b, v_filter_b); __m128i v_temp1_w = _mm_maddubs_epi16(v_input_hi_b, v_filter_b); - __m128i v_res_lo_w = _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), - FILTER_BITS); - __m128i v_res_hi_w = _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w), - FILTER_BITS); + __m128i v_res_lo_w = + _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS); + __m128i v_res_hi_w = + _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w), FILTER_BITS); return _mm_packus_epi16(v_res_lo_w, v_res_hi_w); } // Apply the filter to the contents of the lower half of a and b static INLINE void apply_filter_lo(const __m128i v_a_lo_b, const __m128i v_b_lo_b, - const __m128i v_filter_b, - __m128i* v_res_w) { + const __m128i v_filter_b, __m128i *v_res_w) { const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1)); __m128i v_input_b = _mm_unpacklo_epi8(v_a_lo_b, v_b_lo_b); __m128i v_temp0_w = _mm_maddubs_epi16(v_input_b, v_filter_b); - *v_res_w = _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), - FILTER_BITS); + *v_res_w = + 
_mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS); } static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b, - const __m128i v_m_b, __m128i* v_sum_d, - __m128i* v_sse_q) { + const __m128i v_m_b, __m128i *v_sum_d, + __m128i *v_sse_q) { const __m128i v_zero = _mm_setzero_si128(); // Unpack to 16 bits - still containing max 8 bits const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero); @@ -694,37 +644,38 @@ static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b, } // Functions for width (W) >= 16 -unsigned int vpx_masked_subpel_varWxH_xzero( - const uint8_t *src, int src_stride, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, filter_fn_t filter_fn) { +unsigned int vpx_masked_subpel_varWxH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int w, int h, + filter_fn_t filter_fn) { int i, j; __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + const __m128i v_filter_b = _mm_set1_epi16( + (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); for (j = 0; j < w; j += 16) { // Load the first row ready - v_src0_b = _mm_loadu_si128((const __m128i*)(src + j)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); // Process 2 rows at a time for (i = 0; i < h; i += 2) { // Load the next row apply the filter - v_src1_b = _mm_loadu_si128((const __m128i*)(src + j + src_stride)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride)); v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b); // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j)); + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Load the next row apply the filter - v_src0_b = _mm_loadu_si128((const __m128i*)(src + j + src_stride * 2)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2)); v_res_b = filter_fn(v_src1_b, v_src0_b, v_filter_b); // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j + dst_stride)); - v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j + msk_stride)); + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j + msk_stride)); sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next block of rows src += src_stride * 2; @@ -738,28 +689,29 @@ unsigned int vpx_masked_subpel_varWxH_xzero( } return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); } -unsigned int vpx_masked_subpel_varWxH_yzero( - const uint8_t *src, int src_stride, int xoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, filter_fn_t filter_fn) { +unsigned int vpx_masked_subpel_varWxH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int w, int h, + filter_fn_t filter_fn) { int i, j; __m128i v_src0_b, v_src1_b, 
v_res_b, v_dst_b, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); + const __m128i v_filter_b = _mm_set1_epi16( + (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i++) { for (j = 0; j < w; j += 16) { // Load this row and one below & apply the filter to them - v_src0_b = _mm_loadu_si128((const __m128i*)(src + j)); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + j + 1)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1)); v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b); // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j)); + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); } src += src_stride; @@ -769,49 +721,47 @@ unsigned int vpx_masked_subpel_varWxH_yzero( return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); } unsigned int vpx_masked_subpel_varWxH_xnonzero_ynonzero( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, filter_fn_t xfilter_fn, - filter_fn_t yfilter_fn) { + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int w, int h, filter_fn_t xfilter_fn, + filter_fn_t yfilter_fn) { int i, j; __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b; __m128i v_filtered0_b, v_filtered1_b, v_res_b, v_dst_b, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filterx_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); - const __m128i v_filtery_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + const __m128i v_filterx_b = _mm_set1_epi16( + (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]); + const __m128i v_filtery_b = _mm_set1_epi16( + (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); assert(xoffset < BIL_SUBPEL_SHIFTS); for (j = 0; j < w; j += 16) { // Load the first row ready - v_src0_b = _mm_loadu_si128((const __m128i*)(src + j)); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + j + 1)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1)); v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b); // Process 2 rows at a time for (i = 0; i < h; i += 2) { // Load the next row & apply the filter - v_src2_b = _mm_loadu_si128((const __m128i*)(src + src_stride + j)); - v_src3_b = _mm_loadu_si128((const __m128i*)(src + src_stride + j + 1)); + v_src2_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j)); + v_src3_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1)); v_filtered1_b = xfilter_fn(v_src2_b, v_src3_b, v_filterx_b); // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j)); + v_dst_b = 
_mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); // Complete the calculation for this row and add it to the running total v_res_b = yfilter_fn(v_filtered0_b, v_filtered1_b, v_filtery_b); sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Load the next row & apply the filter - v_src0_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 + j)); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 + - j + 1)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j)); + v_src1_b = + _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1)); v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b); // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i*)(dst + dst_stride + j)); - v_msk_b = _mm_loadu_si128((const __m128i*)(msk + msk_stride + j)); + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + msk_stride + j)); // Complete the calculation for this row and add it to the running total v_res_b = yfilter_fn(v_filtered1_b, v_filtered0_b, v_filtery_b); sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); @@ -830,58 +780,61 @@ unsigned int vpx_masked_subpel_varWxH_xnonzero_ynonzero( // Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, // xmm[63:32] = row 3, xmm[31:0] = row 4 -unsigned int vpx_masked_subpel_var4xH_xzero( - const uint8_t *src, int src_stride, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { +unsigned int vpx_masked_subpel_var4xH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { int i; __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered1_w, v_filtered2_w; __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b; __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); // Load the first row of src data ready - v_src0_b = _mm_loadl_epi64((const __m128i*)src); + v_src0_b = _mm_loadl_epi64((const __m128i *)src); for (i = 0; i < h; i += 4) { // Load the rest of the source data for these rows - v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); v_src1_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); - v_src2_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)); - v_src3_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 3)); + v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); v_src3_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); - v_src0_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 4)); + v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); // Load the dst data - v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 0)); - v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 1)); + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const 
uint32_t *)(dst + dst_stride * 1)); v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); - v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 2)); - v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 3)); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); // Load the mask data - v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 0)); - v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 1)); + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); - v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 2)); - v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 3)); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); // Apply the y filter if (yoffset == HALF_PIXEL_OFFSET) { v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b); - v_src2_b = _mm_or_si128(_mm_slli_si128(v_src1_b, 4), - _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src1_b, 4), + _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); v_res_b = _mm_avg_epu8(v_src1_b, v_src2_b); } else { - v_src2_b = _mm_or_si128(_mm_slli_si128(v_src1_b, 4), - _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0))); + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src1_b, 4), + _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0))); apply_filter_lo(v_src1_b, v_src2_b, v_filter_b, &v_filtered1_w); - v_src2_b = _mm_or_si128(_mm_slli_si128(v_src3_b, 4), - _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src3_b, 4), + _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); apply_filter_lo(v_src3_b, v_src2_b, v_filter_b, &v_filtered2_w); v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered1_w); } @@ -896,49 +849,49 @@ unsigned int vpx_masked_subpel_var4xH_xzero( } // Note order in which rows loaded xmm[127:64] = row 1, xmm[63:0] = row 2 -unsigned int vpx_masked_subpel_var8xH_xzero( - const uint8_t *src, int src_stride, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { +unsigned int vpx_masked_subpel_var8xH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { int i; __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_res_b; __m128i v_dst_b = _mm_setzero_si128(); __m128i v_msk_b = _mm_setzero_si128(); __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); // Load the first row of src data ready - v_src0_b = _mm_loadl_epi64((const __m128i*)src); + v_src0_b = _mm_loadl_epi64((const __m128i *)src); for (i = 0; i < h; i += 2) { if (yoffset == HALF_PIXEL_OFFSET) { // Load the 
rest of the source data for these rows v_src1_b = _mm_or_si128( - _mm_slli_si128(v_src0_b, 8), - _mm_loadl_epi64((const __m128i*)(src + src_stride * 1))); + _mm_slli_si128(v_src0_b, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 1))); v_src0_b = _mm_or_si128( - _mm_slli_si128(v_src1_b, 8), - _mm_loadl_epi64((const __m128i*)(src + src_stride * 2))); + _mm_slli_si128(v_src1_b, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 2))); // Apply the y filter v_res_b = _mm_avg_epu8(v_src1_b, v_src0_b); } else { // Load the data and apply the y filter - v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); apply_filter_lo(v_src0_b, v_src1_b, v_filter_b, &v_filtered0_w); - v_src0_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)); + v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); apply_filter_lo(v_src1_b, v_src0_b, v_filter_b, &v_filtered1_w); v_res_b = _mm_packus_epi16(v_filtered1_w, v_filtered0_w); } // Load the dst data v_dst_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1)), - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0))); // Load the mask data v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1)), - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0))); // Compute the sum and SSE sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -951,10 +904,11 @@ unsigned int vpx_masked_subpel_var8xH_xzero( // Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, // xmm[63:32] = row 3, xmm[31:0] = row 4 -unsigned int vpx_masked_subpel_var4xH_yzero( - const uint8_t *src, int src_stride, int xoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { +unsigned int vpx_masked_subpel_var4xH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { int i; __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w; __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b; @@ -962,38 +916,37 @@ unsigned int vpx_masked_subpel_var4xH_yzero( __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i += 4) { // Load the src data - v_src0_b = _mm_loadl_epi64((const __m128i*)src); + v_src0_b = _mm_loadl_epi64((const __m128i *)src); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); - v_src2_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)); + v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); v_src0_shift_b 
= _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b); v_src2_shift_b = _mm_srli_si128(v_src2_b, 1); - v_src3_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 3)); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); v_src3_shift_b = _mm_srli_si128(v_src3_b, 1); v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b); // Load the dst data - v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 0)); - v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 1)); + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); - v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 2)); - v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 3)); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); // Load the mask data - v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 0)); - v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 1)); + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); - v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 2)); - v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 3)); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); // Apply the x filter @@ -1016,24 +969,24 @@ unsigned int vpx_masked_subpel_var4xH_yzero( return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); } -unsigned int vpx_masked_subpel_var8xH_yzero( - const uint8_t *src, int src_stride, int xoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { +unsigned int vpx_masked_subpel_var8xH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { int i; __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w; __m128i v_src0_shift_b, v_src1_shift_b, v_res_b, v_dst_b, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i += 2) { // Load the src data - v_src0_b = _mm_loadu_si128((const __m128i*)(src)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src)); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride)); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1047,12 +1000,12 @@ unsigned int vpx_masked_subpel_var8xH_yzero( } // Load the dst data v_dst_b = _mm_unpacklo_epi64( - 
_mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); // Load the mask data v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); // Compute the sum and SSE sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -1066,9 +1019,9 @@ unsigned int vpx_masked_subpel_var8xH_yzero( // Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, // xmm[63:32] = row 3, xmm[31:0] = row 4 unsigned int vpx_masked_subpel_var4xH_xnonzero_ynonzero( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h) { int i; __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w; __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b; @@ -1077,25 +1030,23 @@ unsigned int vpx_masked_subpel_var4xH_xnonzero_ynonzero( __m128i v_xres_b[2]; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filterx_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); - __m128i v_filtery_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); assert(yoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i += 4) { // Load the src data - v_src0_b = _mm_loadl_epi64((const __m128i*)src); + v_src0_b = _mm_loadl_epi64((const __m128i *)src); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); - v_src2_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)); + v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b); v_src2_shift_b = _mm_srli_si128(v_src2_b, 1); - v_src3_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 3)); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); v_src3_shift_b = _mm_srli_si128(v_src3_b, 1); v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b); @@ -1113,18 +1064,17 @@ unsigned int vpx_masked_subpel_var4xH_xnonzero_ynonzero( src += src_stride * 4; } // Load one more row to be used in the y filter - v_src0_b = _mm_loadl_epi64((const __m128i*)src); + v_src0_b = _mm_loadl_epi64((const __m128i *)src); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { - v_extra_row_b = _mm_and_si128( - _mm_avg_epu8(v_src0_b, v_src0_shift_b), - 
_mm_setr_epi32(-1, 0, 0, 0)); + v_extra_row_b = _mm_and_si128(_mm_avg_epu8(v_src0_b, v_src0_shift_b), + _mm_setr_epi32(-1, 0, 0, 0)); } else { apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); - v_extra_row_b = _mm_and_si128( - _mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()), - _mm_setr_epi32(-1, 0, 0, 0)); + v_extra_row_b = + _mm_and_si128(_mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()), + _mm_setr_epi32(-1, 0, 0, 0)); } for (i = 0; i < h; i += 4) { @@ -1143,19 +1093,19 @@ unsigned int vpx_masked_subpel_var4xH_xnonzero_ynonzero( } // Load the dst data - v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 0)); - v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 1)); + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); - v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 2)); - v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 3)); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); // Load the mask data - v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 0)); - v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 1)); + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); - v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 2)); - v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 3)); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); // Compute the sum and SSE @@ -1168,27 +1118,25 @@ unsigned int vpx_masked_subpel_var4xH_xnonzero_ynonzero( } unsigned int vpx_masked_subpel_var8xH_xnonzero_ynonzero( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h) { int i; __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_dst_b, v_msk_b; __m128i v_src0_shift_b, v_src1_shift_b; __m128i v_xres0_b, v_xres1_b, v_res_b, v_temp_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filterx_b = _mm_set1_epi16(( - bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); - __m128i v_filtery_b = _mm_set1_epi16(( - bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); assert(yoffset < BIL_SUBPEL_SHIFTS); // Load the first block of src data - v_src0_b = _mm_loadu_si128((const __m128i*)(src)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src)); 
v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride)); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1202,9 +1150,9 @@ unsigned int vpx_masked_subpel_var8xH_xnonzero_ynonzero( } for (i = 0; i < h; i += 4) { // Load the next block of src data - v_src0_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 2)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2)); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 3)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 3)); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1226,19 +1174,19 @@ unsigned int vpx_masked_subpel_var8xH_xnonzero_ynonzero( } // Load the dst data v_dst_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); // Load the mask data v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); // Compute the sum and SSE sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Load the next block of src data - v_src0_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 4)); + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 4)); v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 5)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 5)); v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1260,12 +1208,12 @@ unsigned int vpx_masked_subpel_var8xH_xnonzero_ynonzero( } // Load the dst data v_dst_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); // Load the mask data v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); // Compute the sum and SSE sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -1276,89 +1224,77 @@ unsigned int vpx_masked_subpel_var8xH_xnonzero_ynonzero( return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); } - // For W >=16 #define MASK_SUBPIX_VAR_LARGE(W, H) \ -unsigned int vpx_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - assert(W % 16 == 0); \ - if (xoffset == 0) { \ - if (yoffset == 0) \ - return vpx_masked_variance##W##x##H##_ssse3(src, src_stride, \ - dst, dst_stride, \ - msk, msk_stride, sse); \ - else if (yoffset == HALF_PIXEL_OFFSET) \ - return 
vpx_masked_subpel_varWxH_xzero(src, src_stride, \ - HALF_PIXEL_OFFSET, \ - dst, dst_stride, msk, msk_stride, \ - sse, W, H, apply_filter_avg); \ - else \ - return vpx_masked_subpel_varWxH_xzero(src, src_stride, \ - yoffset, \ - dst, dst_stride, msk, msk_stride, \ - sse, W, H, apply_filter); \ - } else if (yoffset == 0) { \ - if (xoffset == HALF_PIXEL_OFFSET) \ - return vpx_masked_subpel_varWxH_yzero(src, src_stride, \ - HALF_PIXEL_OFFSET, \ - dst, dst_stride, msk, msk_stride, \ - sse, W, H, apply_filter_avg); \ - else \ - return vpx_masked_subpel_varWxH_yzero(src, src_stride, \ - xoffset, \ - dst, dst_stride, msk, msk_stride, \ - sse, W, H, apply_filter); \ - } else if (xoffset == HALF_PIXEL_OFFSET) { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride, \ - HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter_avg, apply_filter_avg); \ - else \ - return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride, \ - HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter_avg, apply_filter); \ - } else { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride, \ - xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter, apply_filter_avg); \ - else \ - return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride, \ - xoffset, yoffset, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter, apply_filter); \ - } \ -} + unsigned int vpx_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + assert(W % 16 == 0); \ + if (xoffset == 0) { \ + if (yoffset == 0) \ + return vpx_masked_variance##W##x##H##_ssse3( \ + src, src_stride, dst, dst_stride, msk, msk_stride, sse); \ + else if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_masked_subpel_varWxH_xzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg); \ + else \ + return vpx_masked_subpel_varWxH_xzero(src, src_stride, yoffset, dst, \ + dst_stride, msk, msk_stride, \ + sse, W, H, apply_filter); \ + } else if (yoffset == 0) { \ + if (xoffset == HALF_PIXEL_OFFSET) \ + return vpx_masked_subpel_varWxH_yzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg); \ + else \ + return vpx_masked_subpel_varWxH_yzero(src, src_stride, xoffset, dst, \ + dst_stride, msk, msk_stride, \ + sse, W, H, apply_filter); \ + } else if (xoffset == HALF_PIXEL_OFFSET) { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \ + dst_stride, msk, msk_stride, sse, W, H, apply_filter_avg, \ + apply_filter_avg); \ + else \ + return vpx_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg, apply_filter); \ + } else { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter, apply_filter_avg); \ + else \ + return vpx_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, 
H, apply_filter, apply_filter); \ + } \ + } // For W < 16 #define MASK_SUBPIX_VAR_SMALL(W, H) \ -unsigned int vpx_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - assert(W == 4 || W == 8); \ - if (xoffset == 0 && yoffset == 0) \ - return vpx_masked_variance##W##x##H##_ssse3(src, src_stride, \ - dst, dst_stride, \ - msk, msk_stride, sse); \ - else if (xoffset == 0) \ - return vpx_masked_subpel_var##W##xH_xzero(src, src_stride, yoffset, \ - dst, dst_stride, \ - msk, msk_stride, sse, H); \ - else if (yoffset == 0) \ - return vpx_masked_subpel_var##W##xH_yzero(src, src_stride, xoffset, \ - dst, dst_stride, \ - msk, msk_stride, sse, H); \ - else \ - return vpx_masked_subpel_var##W##xH_xnonzero_ynonzero( \ - src, src_stride, xoffset, yoffset, dst, dst_stride, \ - msk, msk_stride, sse, H); \ -} + unsigned int vpx_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + assert(W == 4 || W == 8); \ + if (xoffset == 0 && yoffset == 0) \ + return vpx_masked_variance##W##x##H##_ssse3( \ + src, src_stride, dst, dst_stride, msk, msk_stride, sse); \ + else if (xoffset == 0) \ + return vpx_masked_subpel_var##W##xH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H); \ + else if (yoffset == 0) \ + return vpx_masked_subpel_var##W##xH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H); \ + else \ + return vpx_masked_subpel_var##W##xH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \ + sse, H); \ + } MASK_SUBPIX_VAR_SMALL(4, 4) MASK_SUBPIX_VAR_SMALL(4, 8) @@ -1381,20 +1317,19 @@ MASK_SUBPIX_VAR_LARGE(128, 128) #if CONFIG_VP9_HIGHBITDEPTH typedef uint32_t (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q, - uint32_t *sse, - const int w, const int h); -typedef unsigned int (*highbd_variance_fn_t)( - const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - unsigned int *sse); + uint32_t *sse, const int w, + const int h); +typedef unsigned int (*highbd_variance_fn_t)(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, + unsigned int *sse); typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w, - __m128i v_filter_w); + __m128i v_filter_w); static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w, const __m128i v_b_w, const __m128i v_filter_w) { - (void) v_filter_w; + (void)v_filter_w; return _mm_avg_epu16(v_a_w, v_b_w); } @@ -1406,27 +1341,27 @@ static INLINE __m128i highbd_apply_filter(const __m128i v_a_w, __m128i v_input_hi_w = _mm_unpackhi_epi16(v_a_w, v_b_w); __m128i v_temp0_d = _mm_madd_epi16(v_input_lo_w, v_filter_w); __m128i v_temp1_d = _mm_madd_epi16(v_input_hi_w, v_filter_w); - __m128i v_res_lo_d = _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), - FILTER_BITS); - __m128i v_res_hi_d = _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d), - FILTER_BITS); + __m128i v_res_lo_d = + _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS); + __m128i v_res_hi_d = + _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d), FILTER_BITS); return _mm_packs_epi32(v_res_lo_d, v_res_hi_d); } // Apply the filter to the contents of the lower half of a and 
b static INLINE void highbd_apply_filter_lo(const __m128i v_a_lo_w, const __m128i v_b_lo_w, const __m128i v_filter_w, - __m128i* v_res_d) { + __m128i *v_res_d) { const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1)); __m128i v_input_w = _mm_unpacklo_epi16(v_a_lo_w, v_b_lo_w); __m128i v_temp0_d = _mm_madd_epi16(v_input_w, v_filter_w); - *v_res_d = _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), - FILTER_BITS); + *v_res_d = + _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS); } static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w, - const __m128i v_m_b, __m128i* v_sum_d, - __m128i* v_sse_q) { + const __m128i v_m_b, __m128i *v_sum_d, + __m128i *v_sse_q) { const __m128i v_zero = _mm_setzero_si128(); const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); @@ -1461,11 +1396,8 @@ static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w, *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q); } -static INLINE uint32_t highbd_10_calc_masked_variance(__m128i v_sum_d, - __m128i v_sse_q, - uint32_t* sse, - const int w, - const int h) { +static INLINE uint32_t highbd_10_calc_masked_variance( + __m128i v_sum_d, __m128i v_sse_q, uint32_t *sse, const int w, const int h) { int64_t sum64; uint64_t sse64; @@ -1486,13 +1418,10 @@ static INLINE uint32_t highbd_10_calc_masked_variance(__m128i v_sum_d, // Store the SSE *sse = (uint32_t)sse64; // Compute the variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); } -static INLINE uint32_t highbd_12_calc_masked_variance(__m128i v_sum_d, - __m128i v_sse_q, - uint32_t* sse, - const int w, - const int h) { +static INLINE uint32_t highbd_12_calc_masked_variance( + __m128i v_sum_d, __m128i v_sse_q, uint32_t *sse, const int w, const int h) { int64_t sum64; uint64_t sse64; @@ -1513,43 +1442,42 @@ static INLINE uint32_t highbd_12_calc_masked_variance(__m128i v_sum_d, // Store the SSE *sse = (uint32_t)sse64; // Compute the variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); } - // High bit depth functions for width (W) >= 8 unsigned int vpx_highbd_masked_subpel_varWxH_xzero( - const uint16_t *src, int src_stride, int yoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, highbd_filter_fn_t filter_fn, - highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int w, int h, highbd_filter_fn_t filter_fn, + highbd_calc_masked_var_t calc_var) { int i, j; __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_w = _mm_set1_epi32(( - bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); + const __m128i v_filter_w = + _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); for (j = 0; j < w; j += 8) { // Load the first row ready - v_src0_w = _mm_loadu_si128((const __m128i*)(src + j)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); // Process 2 rows at a time for (i = 0; i < h; i += 2) { // Load the next row apply the filter - v_src1_w = _mm_loadu_si128((const __m128i*)(src + j + src_stride)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride)); v_res_w = filter_fn(v_src0_w, 
v_src1_w, v_filter_w); // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j)); + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Load the next row apply the filter - v_src0_w = _mm_loadu_si128((const __m128i*)(src + j + src_stride * 2)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2)); v_res_w = filter_fn(v_src1_w, v_src0_w, v_filter_w); // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j + dst_stride)); - v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j + msk_stride)); + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j + msk_stride)); highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next block of rows src += src_stride * 2; @@ -1564,28 +1492,28 @@ unsigned int vpx_highbd_masked_subpel_varWxH_xzero( return calc_var(v_sum_d, v_sse_q, sse, w, h); } unsigned int vpx_highbd_masked_subpel_varWxH_yzero( - const uint16_t *src, int src_stride, int xoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, highbd_filter_fn_t filter_fn, - highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int w, int h, highbd_filter_fn_t filter_fn, + highbd_calc_masked_var_t calc_var) { int i, j; __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_w = _mm_set1_epi32(( - bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); + const __m128i v_filter_w = + _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i++) { for (j = 0; j < w; j += 8) { // Load this row & apply the filter to them - v_src0_w = _mm_loadu_si128((const __m128i*)(src + j)); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + j + 1)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1)); v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w); // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j)); + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); } src += src_stride; @@ -1596,49 +1524,49 @@ unsigned int vpx_highbd_masked_subpel_varWxH_yzero( } unsigned int vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( - const uint16_t *src, int src_stride, int xoffset, int yoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn, - highbd_filter_fn_t yfilter_fn, highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int xoffset, int yoffset, + const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn, + highbd_filter_fn_t yfilter_fn, 
highbd_calc_masked_var_t calc_var) { int i, j; __m128i v_src0_w, v_src1_w, v_src2_w, v_src3_w; __m128i v_filtered0_w, v_filtered1_w, v_res_w, v_dst_w, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filterx_w = _mm_set1_epi32(( - bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); - const __m128i v_filtery_w = _mm_set1_epi32(( - bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); + const __m128i v_filterx_w = + _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + const __m128i v_filtery_w = + _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); assert(yoffset < BIL_SUBPEL_SHIFTS); for (j = 0; j < w; j += 8) { // Load the first row ready - v_src0_w = _mm_loadu_si128((const __m128i*)(src + j)); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + j + 1)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1)); v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w); // Process 2 rows at a time for (i = 0; i < h; i += 2) { // Load the next row & apply the filter - v_src2_w = _mm_loadu_si128((const __m128i*)(src + src_stride + j)); - v_src3_w = _mm_loadu_si128((const __m128i*)(src + src_stride + j + 1)); + v_src2_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j)); + v_src3_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1)); v_filtered1_w = xfilter_fn(v_src2_w, v_src3_w, v_filterx_w); // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j)); - v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j)); + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); // Complete the calculation for this row and add it to the running total v_res_w = yfilter_fn(v_filtered0_w, v_filtered1_w, v_filtery_w); highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Load the next row & apply the filter - v_src0_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 + j)); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 + - j + 1)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j)); + v_src1_w = + _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1)); v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w); // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i*)(dst + dst_stride + j)); - v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + msk_stride + j)); + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + msk_stride + j)); // Complete the calculation for this row and add it to the running total v_res_w = yfilter_fn(v_filtered1_w, v_filtered0_w, v_filtery_w); highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); @@ -1657,47 +1585,46 @@ unsigned int vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( // Note order in which rows loaded xmm[127:64] = row 1, xmm[63:0] = row 2 unsigned int vpx_highbd_masked_subpel_var4xH_xzero( - const uint16_t *src, int src_stride, int yoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst, + int dst_stride, const 
uint8_t *msk, int msk_stride, unsigned int *sse, + int h, highbd_calc_masked_var_t calc_var) { int i; __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_res_w; __m128i v_dst_w, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_w = _mm_set1_epi32(( - bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); assert(yoffset < BIL_SUBPEL_SHIFTS); // Load the first row of src data ready - v_src0_w = _mm_loadl_epi64((const __m128i*)src); + v_src0_w = _mm_loadl_epi64((const __m128i *)src); for (i = 0; i < h; i += 2) { if (yoffset == HALF_PIXEL_OFFSET) { // Load the rest of the source data for these rows v_src1_w = _mm_or_si128( - _mm_slli_si128(v_src0_w, 8), - _mm_loadl_epi64((const __m128i*)(src + src_stride * 1))); + _mm_slli_si128(v_src0_w, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 1))); v_src0_w = _mm_or_si128( - _mm_slli_si128(v_src1_w, 8), - _mm_loadl_epi64((const __m128i*)(src + src_stride * 2))); + _mm_slli_si128(v_src1_w, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 2))); // Apply the y filter v_res_w = _mm_avg_epu16(v_src1_w, v_src0_w); } else { // Load the data and apply the y filter - v_src1_w = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)); + v_src1_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); highbd_apply_filter_lo(v_src0_w, v_src1_w, v_filter_w, &v_filtered0_d); - v_src0_w = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)); + v_src0_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); highbd_apply_filter_lo(v_src1_w, v_src0_w, v_filter_w, &v_filtered1_d); v_res_w = _mm_packs_epi32(v_filtered1_d, v_filtered0_d); } // Load the dst data v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1)), - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0))); // Load the mask data v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1)), - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0))); // Compute the sum and SSE highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -1709,23 +1636,22 @@ unsigned int vpx_highbd_masked_subpel_var4xH_xzero( } unsigned int vpx_highbd_masked_subpel_var4xH_yzero( - const uint16_t *src, int src_stride, int xoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int h, highbd_calc_masked_var_t calc_var) { int i; __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d; __m128i v_src0_shift_w, v_src1_shift_w, v_res_w, v_dst_w, v_msk_b; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_w = _mm_set1_epi32(( - bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); + __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); for (i = 0; i < h; i += 2) 
{ // Load the src data - v_src0_w = _mm_loadu_si128((const __m128i*)(src)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src)); v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride)); v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1741,12 +1667,12 @@ unsigned int vpx_highbd_masked_subpel_var4xH_yzero( } // Load the dst data v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); // Load the mask data v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); // Compute the sum and SSE highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -1758,27 +1684,25 @@ unsigned int vpx_highbd_masked_subpel_var4xH_yzero( } unsigned int vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( - const uint16_t *src, int src_stride, int xoffset, int yoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) { + const uint16_t *src, int src_stride, int xoffset, int yoffset, + const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) { int i; __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_dst_w, v_msk_b; __m128i v_src0_shift_w, v_src1_shift_w; __m128i v_xres0_w, v_xres1_w, v_res_w, v_temp_w; __m128i v_sum_d = _mm_setzero_si128(); __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filterx_w = _mm_set1_epi32(( - bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); - __m128i v_filtery_w = _mm_set1_epi32(( - bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); + __m128i v_filterx_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); assert(xoffset < BIL_SUBPEL_SHIFTS); assert(yoffset < BIL_SUBPEL_SHIFTS); // Load the first block of src data - v_src0_w = _mm_loadu_si128((const __m128i*)(src)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src)); v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride)); v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1794,9 +1718,9 @@ unsigned int vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( } for (i = 0; i < h; i += 4) { // Load the next block of src data - v_src0_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 2)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2)); v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 3)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 3)); v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); // Apply the x filter if (xoffset == 
HALF_PIXEL_OFFSET) { @@ -1820,19 +1744,19 @@ unsigned int vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( } // Load the dst data v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); // Load the mask data v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); // Compute the sum and SSE highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Load the next block of src data - v_src0_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 4)); + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 4)); v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 5)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 5)); v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); // Apply the x filter if (xoffset == HALF_PIXEL_OFFSET) { @@ -1856,12 +1780,12 @@ unsigned int vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( } // Load the dst data v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); // Load the mask data v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); // Compute the sum and SSE highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); // Move onto the next set of rows @@ -1874,145 +1798,115 @@ unsigned int vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( // For W >=8 #define HIGHBD_MASK_SUBPIX_VAR_LARGE(W, H) \ -unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse, \ - highbd_calc_masked_var_t calc_var, \ - highbd_variance_fn_t full_variance_function) { \ - uint16_t* src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t* dst = CONVERT_TO_SHORTPTR(dst8); \ - assert(W % 8 == 0); \ - if (xoffset == 0) { \ - if (yoffset == 0) \ - return full_variance_function(src8, src_stride, dst8, dst_stride, \ - msk, msk_stride, sse); \ - else if (yoffset == HALF_PIXEL_OFFSET) \ - return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride, \ - HALF_PIXEL_OFFSET, \ - dst, dst_stride, \ - msk, msk_stride, \ - sse, W, H, \ - highbd_apply_filter_avg, \ - calc_var); \ - else \ - return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride, \ - yoffset, \ - dst, dst_stride, \ - msk, msk_stride, \ - sse, W, H, \ - highbd_apply_filter, \ - calc_var); \ - } else if (yoffset == 0) { \ - if (xoffset == HALF_PIXEL_OFFSET) \ - return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride, \ - HALF_PIXEL_OFFSET, \ - dst, dst_stride, \ - msk, msk_stride, \ - sse, W, H, \ - highbd_apply_filter_avg, \ - calc_var); \ - else \ - return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride, 
\ - xoffset, \ - dst, dst_stride, \ - msk, msk_stride, \ - sse, W, H, \ - highbd_apply_filter, \ - calc_var); \ - } else if (xoffset == HALF_PIXEL_OFFSET) { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, \ - dst, dst_stride, msk, msk_stride, sse, W, H, \ - highbd_apply_filter_avg, highbd_apply_filter_avg, calc_var); \ - else \ - return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, \ - msk, msk_stride, sse, W, H, highbd_apply_filter_avg, \ - highbd_apply_filter, calc_var); \ - } else { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, xoffset, HALF_PIXEL_OFFSET, \ - dst, dst_stride, msk, msk_stride, sse, W, H, \ - highbd_apply_filter, highbd_apply_filter_avg, calc_var); \ - else \ - return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, xoffset, yoffset, \ - dst, dst_stride, msk, msk_stride, sse, W, H, \ - highbd_apply_filter, highbd_apply_filter, calc_var); \ - } \ -} + unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse, highbd_calc_masked_var_t calc_var, \ + highbd_variance_fn_t full_variance_function) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + assert(W % 8 == 0); \ + if (xoffset == 0) { \ + if (yoffset == 0) \ + return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \ + msk_stride, sse); \ + else if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_highbd_masked_subpel_varWxH_xzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \ + else \ + return vpx_highbd_masked_subpel_varWxH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, \ + W, H, highbd_apply_filter, calc_var); \ + } else if (yoffset == 0) { \ + if (xoffset == HALF_PIXEL_OFFSET) \ + return vpx_highbd_masked_subpel_varWxH_yzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \ + else \ + return vpx_highbd_masked_subpel_varWxH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, \ + W, H, highbd_apply_filter, calc_var); \ + } else if (xoffset == HALF_PIXEL_OFFSET) { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \ + dst_stride, msk, msk_stride, sse, W, H, highbd_apply_filter_avg, \ + highbd_apply_filter_avg, calc_var); \ + else \ + return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, \ + highbd_apply_filter, calc_var); \ + } else { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter, \ + highbd_apply_filter_avg, calc_var); \ + else \ + return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter, 
highbd_apply_filter, \ + calc_var); \ + } \ + } // For W < 8 #define HIGHBD_MASK_SUBPIX_VAR_SMALL(W, H) \ -unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse, \ - highbd_calc_masked_var_t calc_var, \ - highbd_variance_fn_t full_variance_function) { \ - uint16_t* src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t* dst = CONVERT_TO_SHORTPTR(dst8); \ - assert(W == 4); \ - if (xoffset == 0 && yoffset == 0) \ - return full_variance_function(src8, src_stride, dst8, dst_stride, \ - msk, msk_stride, sse); \ - else if (xoffset == 0) \ - return vpx_highbd_masked_subpel_var4xH_xzero(src, src_stride, yoffset, \ - dst, dst_stride, \ - msk, msk_stride, sse, H, \ - calc_var); \ - else if (yoffset == 0) \ - return vpx_highbd_masked_subpel_var4xH_yzero(src, src_stride, xoffset, \ - dst, dst_stride, \ - msk, msk_stride, sse, H, \ - calc_var); \ - else \ - return vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( \ - src, src_stride, xoffset, yoffset, dst, dst_stride, \ - msk, msk_stride, sse, H, calc_var); \ -} + unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse, highbd_calc_masked_var_t calc_var, \ + highbd_variance_fn_t full_variance_function) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + assert(W == 4); \ + if (xoffset == 0 && yoffset == 0) \ + return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \ + msk_stride, sse); \ + else if (xoffset == 0) \ + return vpx_highbd_masked_subpel_var4xH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H, \ + calc_var); \ + else if (yoffset == 0) \ + return vpx_highbd_masked_subpel_var4xH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H, \ + calc_var); \ + else \ + return vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \ + sse, H, calc_var); \ + } #define HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(W, H) \ -unsigned int vpx_highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - return highbd_masked_sub_pixel_variance##W##x##H##_ssse3(src8, src_stride, \ - xoffset, yoffset, dst8, dst_stride, msk, msk_stride, sse, \ - calc_masked_variance, \ - vpx_highbd_masked_variance##W##x##H##_ssse3); \ -} \ -unsigned int vpx_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - return highbd_masked_sub_pixel_variance##W##x##H##_ssse3(src8, src_stride, \ - xoffset, yoffset, dst8, dst_stride, msk, msk_stride, sse, \ - highbd_10_calc_masked_variance, \ - vpx_highbd_10_masked_variance##W##x##H##_ssse3); \ -} \ -unsigned int vpx_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, \ - const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - return 
highbd_masked_sub_pixel_variance##W##x##H##_ssse3(src8, src_stride, \ - xoffset, yoffset, dst8, dst_stride, msk, msk_stride, sse, \ - highbd_12_calc_masked_variance, \ - vpx_highbd_12_masked_variance##W##x##H##_ssse3); \ -} \ + unsigned int vpx_highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, calc_masked_variance, \ + vpx_highbd_masked_variance##W##x##H##_ssse3); \ + } \ + unsigned int vpx_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, highbd_10_calc_masked_variance, \ + vpx_highbd_10_masked_variance##W##x##H##_ssse3); \ + } \ + unsigned int vpx_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, highbd_12_calc_masked_variance, \ + vpx_highbd_12_masked_variance##W##x##H##_ssse3); \ + } HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 4) HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 4) diff --git a/vpx_dsp/x86/obmc_sad_sse4.c b/vpx_dsp/x86/obmc_sad_sse4.c index e21bb98c140418e068d7e3cbcd9c6a77b074adae..8a1581c19db17338c60744eca855e70266aeceea 100644 --- a/vpx_dsp/x86/obmc_sad_sse4.c +++ b/vpx_dsp/x86/obmc_sad_sse4.c @@ -22,10 +22,8 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// -static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, +static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, const int height) { const int pre_step = pre_stride - 4; int n = 0; @@ -62,8 +60,7 @@ static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, - const int32_t *mask, - const int width, + const int32_t *mask, const int width, const int height) { const int pre_step = pre_stride - width; int n = 0; @@ -109,17 +106,16 @@ static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre, return xx_hsum_epi32_si32(v_sad_d); } -#define OBMCSADWXH(w, h) \ -unsigned int vpx_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *msk) { \ - if (w == 4) { \ - return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ - } else { \ - return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ - } \ -} +#define OBMCSADWXH(w, h) \ + unsigned int vpx_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } #if CONFIG_EXT_PARTITION OBMCSADWXH(128, 128) @@ -187,8 +183,7 @@ 
static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, - const int width, - const int height) { + const int width, const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - width; int n = 0; @@ -234,17 +229,16 @@ static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, return xx_hsum_epi32_si32(v_sad_d); } -#define HBD_OBMCSADWXH(w, h) \ -unsigned int vpx_highbd_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask) { \ - if (w == 4) { \ - return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ - } else { \ - return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ - } \ -} +#define HBD_OBMCSADWXH(w, h) \ + unsigned int vpx_highbd_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } #if CONFIG_EXT_PARTITION HBD_OBMCSADWXH(128, 128) diff --git a/vpx_dsp/x86/obmc_variance_sse4.c b/vpx_dsp/x86/obmc_variance_sse4.c index b967c10d5916971c0d11a1cb961b007a9612834d..616db27a6fa456d81516d1abad6ab48fe3a72a69 100644 --- a/vpx_dsp/x86/obmc_variance_sse4.c +++ b/vpx_dsp/x86/obmc_variance_sse4.c @@ -23,12 +23,9 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// -static INLINE void obmc_variance_w4(const uint8_t *pre, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - unsigned int *const sse, - int *const sum, +static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, const int h) { const int pre_step = pre_stride - 4; int n = 0; @@ -65,14 +62,10 @@ static INLINE void obmc_variance_w4(const uint8_t *pre, *sse = xx_hsum_epi32_si32(v_sse_d); } -static INLINE void obmc_variance_w8n(const uint8_t *pre, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - unsigned int *const sse, - int *const sum, - const int w, - const int h) { +static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { const int pre_step = pre_stride - w; int n = 0; __m128i v_sum_d = _mm_setzero_si128(); @@ -120,20 +113,18 @@ static INLINE void obmc_variance_w8n(const uint8_t *pre, *sse = xx_hsum_epi32_si32(v_sse_d); } -#define OBMCVARWXH(W, H) \ -unsigned int vpx_obmc_variance##W##x##H##_sse4_1(const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - if (W == 4) { \ - obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ - } else { \ - obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ - } \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define OBMCVARWXH(W, H) \ + unsigned int vpx_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } #if CONFIG_EXT_PARTITION 
OBMCVARWXH(128, 128) @@ -159,13 +150,9 @@ OBMCVARWXH(4, 4) //////////////////////////////////////////////////////////////////////////////// #if CONFIG_VP9_HIGHBITDEPTH -static INLINE void hbd_obmc_variance_w4(const uint8_t *pre8, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - uint64_t *const sse, - int64_t *const sum, - const int h) { +static INLINE void hbd_obmc_variance_w4( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - 4; int n = 0; @@ -202,14 +189,10 @@ static INLINE void hbd_obmc_variance_w4(const uint8_t *pre8, *sse = xx_hsum_epi32_si32(v_sse_d); } -static INLINE void hbd_obmc_variance_w8n(const uint8_t *pre8, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - uint64_t *const sse, - int64_t *const sum, - const int w, - const int h) { +static INLINE void hbd_obmc_variance_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w, + const int h) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - w; int n = 0; @@ -260,8 +243,7 @@ static INLINE void hbd_obmc_variance_w8n(const uint8_t *pre8, static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64 = 0; uint64_t sse64 = 0; @@ -276,8 +258,7 @@ static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64 = 0; uint64_t sse64 = 0; @@ -292,15 +273,14 @@ static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, - const int32_t *mask, - int w, int h, + const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { int64_t sum64 = 0; uint64_t sse64 = 0; if (w == 128) { do { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, - &sse64, &sum64, 128, 32); + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128, + 32); pre8 += 32 * pre_stride; wsrc += 32 * 128; mask += 32 * 128; @@ -308,8 +288,8 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, } while (h > 0); } else if (w == 64 && h >= 128) { do { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, - &sse64, &sum64, 64, 64); + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64, + 64); pre8 += 64 * pre_stride; wsrc += 64 * 64; mask += 64 * 64; @@ -324,39 +304,30 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); } -#define HBD_OBMCVARWXH(W, H) \ -unsigned int vpx_highbd_obmc_variance##W##x##H##_sse4_1( \ - const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ - \ -unsigned int vpx_highbd_10_obmc_variance##W##x##H##_sse4_1( \ - const uint8_t *pre, \ - int pre_stride, \ - const int32_t 
*wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ - \ -unsigned int vpx_highbd_12_obmc_variance##W##x##H##_sse4_1( \ - const uint8_t *pre, \ - int pre_stride, \ - const int32_t *wsrc, \ - const int32_t *mask, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} +#define HBD_OBMCVARWXH(W, H) \ + unsigned int vpx_highbd_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_10_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int vpx_highbd_12_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ + } #if CONFIG_EXT_PARTITION HBD_OBMCVARWXH(128, 128) diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 8aa4568d674c95c982f8adb1e90870278ea5501c..2c7e431c745a74bb914e8b6d1d33260f940cd76f 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -17,8 +17,9 @@ static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { #if CONFIG_VP9_HIGHBITDEPTH return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], (int16_t)coeff_ptr[4], - (int16_t)coeff_ptr[5], (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); #else return _mm_load_si128((const __m128i *)coeff_ptr); #endif @@ -32,21 +33,20 @@ static INLINE void store_coefficients(__m128i coeff_vals, __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i*)(coeff_ptr + 4), coeff_vals_2); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); #else - _mm_store_si128((__m128i*)(coeff_ptr), coeff_vals); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); #endif } -void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t* zbin_ptr, - const int16_t* round_ptr, const int16_t* quant_ptr, - const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr, - tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr, - uint16_t* eob_ptr, - const int16_t* scan_ptr, - const int16_t* iscan_ptr) { +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, 
tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { __m128i zero; (void)scan_ptr; @@ -66,13 +66,13 @@ void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs, // Setup global values { __m128i pw_1; - zbin = _mm_load_si128((const __m128i*)zbin_ptr); - round = _mm_load_si128((const __m128i*)round_ptr); - quant = _mm_load_si128((const __m128i*)quant_ptr); + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); pw_1 = _mm_set1_epi16(1); zbin = _mm_sub_epi16(zbin, pw_1); - dequant = _mm_load_si128((const __m128i*)dequant_ptr); - shift = _mm_load_si128((const __m128i*)quant_shift_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); } { @@ -138,8 +138,8 @@ void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -211,8 +211,8 @@ void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 793658f9ea937098aa17394e0de65d0ce7485ce9..962b8fb11a423dacbb23b130285deb915027f332 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -11,10 +11,8 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_sad32x32x4d_avx2(const uint8_t *src, - int src_stride, - const uint8_t *const ref[4], - int ref_stride, +void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; @@ -30,7 +28,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src, sum_ref1 = _mm256_set1_epi16(0); sum_ref2 = _mm256_set1_epi16(0); sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 32 ; i++) { + for (i = 0; i < 32; i++) { // load src and all refs src_reg = _mm256_loadu_si256((const __m256i *)src); ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); @@ -48,11 +46,11 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src, sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - src+= src_stride; - ref0+= ref_stride; - ref1+= ref_stride; - ref2+= ref_stride; - ref3+= ref_stride; + src += src_stride; + ref0 += ref_stride; + 
ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; } { __m128i sum; @@ -81,10 +79,8 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src, } } -void vpx_sad64x64x4d_avx2(const uint8_t *src, - int src_stride, - const uint8_t *const ref[4], - int ref_stride, +void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; @@ -102,7 +98,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, sum_ref1 = _mm256_set1_epi16(0); sum_ref2 = _mm256_set1_epi16(0); sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 64 ; i++) { + for (i = 0; i < 64; i++) { // load 64 bytes from src and all refs src_reg = _mm256_loadu_si256((const __m256i *)src); srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); @@ -133,11 +129,11 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); - src+= src_stride; - ref0+= ref_stride; - ref1+= ref_stride; - ref2+= ref_stride; - ref3+= ref_stride; + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; } { __m128i sum; diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index ce9ad8f780c244d2e2f52fe7b90dcf396f9836ec..d94413430549279b49193dc2237044b2fd1ea093 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -11,75 +11,74 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -#define FSAD64_H(h) \ -unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0 ; i < h ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr+= ref_stride; \ - src_ptr+= src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} +#define FSAD64_H(h) \ + unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + } \ + sum_sad_h = 
_mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ + } -#define FSAD32_H(h) \ -unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0 ; i < max ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr+= ref2_stride; \ - src_ptr+= src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} +#define FSAD32_H(h) \ + unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ + } -#define FSAD64 \ -FSAD64_H(64); \ -FSAD64_H(32); +#define FSAD64 \ + FSAD64_H(64); \ + FSAD64_H(32); -#define FSAD32 \ -FSAD32_H(64); \ -FSAD32_H(32); \ -FSAD32_H(16); +#define FSAD32 \ + FSAD32_H(64); \ + FSAD32_H(32); \ + FSAD32_H(16); FSAD64; FSAD32; @@ -89,88 +88,86 @@ FSAD32; #undef FSAD64_H #undef FSAD32_H -#define FSADAVG64_H(h) \ -unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride, \ - const uint8_t *second_pred) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0 ; i < h ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - ref1_reg = 
_mm256_avg_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)second_pred)); \ - ref2_reg = _mm256_avg_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr+= ref_stride; \ - src_ptr+= src_stride; \ - second_pred+= 64; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} +#define FSADAVG64_H(h) \ + unsigned int vpx_sad64x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ + } -#define FSADAVG32_H(h) \ -unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride, \ - const uint8_t *second_pred) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0 ; i < max ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - ref1_reg = _mm256_avg_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)second_pred)); \ - ref2_reg = _mm256_avg_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = _mm256_add_epi32(sum_sad, \ - _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr+= ref2_stride; \ - src_ptr+= src2_stride; \ - second_pred+= 64; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = 
_mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} +#define FSADAVG32_H(h) \ + unsigned int vpx_sad32x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + return res; \ + } -#define FSADAVG64 \ -FSADAVG64_H(64); \ -FSADAVG64_H(32); +#define FSADAVG64 \ + FSADAVG64_H(64); \ + FSADAVG64_H(32); -#define FSADAVG32 \ -FSADAVG32_H(64); \ -FSADAVG32_H(32); \ -FSADAVG32_H(16); +#define FSADAVG32 \ + FSADAVG32_H(64); \ + FSADAVG32_H(32); \ + FSADAVG32_H(16); FSADAVG64; FSADAVG32; diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c index 5ecd87e7387ee492c0421bebdae1ca2b76e44fae..3d24716f13b28797ba7da93fb2d87e1a0a6a3653 100644 --- a/vpx_dsp/x86/sum_squares_sse2.c +++ b/vpx_dsp/x86/sum_squares_sse2.c @@ -18,10 +18,14 @@ static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) { - const __m128i v_val_0_w = _mm_loadl_epi64((const __m128i*)(src+0*stride)); - const __m128i v_val_1_w = _mm_loadl_epi64((const __m128i*)(src+1*stride)); - const __m128i v_val_2_w = _mm_loadl_epi64((const __m128i*)(src+2*stride)); - const __m128i v_val_3_w = _mm_loadl_epi64((const __m128i*)(src+3*stride)); + const __m128i v_val_0_w = + _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); + const __m128i v_val_1_w = + _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); + const __m128i v_val_2_w = + _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); + const __m128i v_val_3_w = + _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); @@ -32,8 +36,8 @@ static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src, const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, - _mm_srli_epi64(v_sum_0123_d, 32)); + const __m128i v_sum_d = + _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); return (uint64_t)_mm_cvtsi128_si32(v_sum_d); } @@ -44,9 +48,8 @@ static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t 
*src, // maintenance instructions in the common case of 4x4. __attribute__((noinline)) #endif -static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, - int stride, - int size) { +static uint64_t +vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) { int r, c; const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); @@ -56,16 +59,24 @@ static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, __m128i v_acc_d = _mm_setzero_si128(); for (c = 0; c < size; c += 8) { - const int16_t *b = src+c; - - const __m128i v_val_0_w = _mm_load_si128((const __m128i*)(b+0*stride)); - const __m128i v_val_1_w = _mm_load_si128((const __m128i*)(b+1*stride)); - const __m128i v_val_2_w = _mm_load_si128((const __m128i*)(b+2*stride)); - const __m128i v_val_3_w = _mm_load_si128((const __m128i*)(b+3*stride)); - const __m128i v_val_4_w = _mm_load_si128((const __m128i*)(b+4*stride)); - const __m128i v_val_5_w = _mm_load_si128((const __m128i*)(b+5*stride)); - const __m128i v_val_6_w = _mm_load_si128((const __m128i*)(b+6*stride)); - const __m128i v_val_7_w = _mm_load_si128((const __m128i*)(b+7*stride)); + const int16_t *b = src + c; + + const __m128i v_val_0_w = + _mm_load_si128((const __m128i *)(b + 0 * stride)); + const __m128i v_val_1_w = + _mm_load_si128((const __m128i *)(b + 1 * stride)); + const __m128i v_val_2_w = + _mm_load_si128((const __m128i *)(b + 2 * stride)); + const __m128i v_val_3_w = + _mm_load_si128((const __m128i *)(b + 3 * stride)); + const __m128i v_val_4_w = + _mm_load_si128((const __m128i *)(b + 4 * stride)); + const __m128i v_val_5_w = + _mm_load_si128((const __m128i *)(b + 5 * stride)); + const __m128i v_val_6_w = + _mm_load_si128((const __m128i *)(b + 6 * stride)); + const __m128i v_val_7_w = + _mm_load_si128((const __m128i *)(b + 7 * stride)); const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); @@ -91,7 +102,7 @@ static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); - src += 8*stride; + src += 8 * stride; } v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); @@ -101,21 +112,20 @@ static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, #else { uint64_t tmp; - _mm_storel_epi64((__m128i*)&tmp, v_acc_q); + _mm_storel_epi64((__m128i *)&tmp, v_acc_q); return tmp; } #endif } -uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, - int size) { +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { // 4 elements per row only requires half an XMM register, so this // must be a special case, but also note that over 75% of all calls // are with size == 4, so it is also the common case. 
if (LIKELY(size == 4)) { return vpx_sum_squares_2d_i16_4x4_sse2(src, stride); } else { - // Generic case + // Generic case return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size); } } @@ -176,7 +186,7 @@ static uint64_t vpx_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { #else { uint64_t tmp; - _mm_storel_epi64((__m128i*)&tmp, v_acc0_q); + _mm_storel_epi64((__m128i *)&tmp, v_acc0_q); return tmp; } #endif @@ -186,7 +196,7 @@ uint64_t vpx_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { if (n % 64 == 0) { return vpx_sum_squares_i16_64n_sse2(src, n); } else if (n > 64) { - int k = n & ~(64-1); + int k = n & ~(64 - 1); return vpx_sum_squares_i16_64n_sse2(src, k) + vpx_sum_squares_i16_c(src + k, n - k); } else { diff --git a/vpx_dsp/x86/synonyms.h b/vpx_dsp/x86/synonyms.h index 6708dd1101b0eb4e1170a5497497af53096b2e2e..fb4b9428b00716bcebfe708ea6f135adfb149d10 100644 --- a/vpx_dsp/x86/synonyms.h +++ b/vpx_dsp/x86/synonyms.h @@ -26,35 +26,35 @@ // Loads and stores to do away with the tedium of casting the address // to the right type. static INLINE __m128i xx_loadl_32(const void *a) { - return _mm_cvtsi32_si128(*(const uint32_t*)a); + return _mm_cvtsi32_si128(*(const uint32_t *)a); } static INLINE __m128i xx_loadl_64(const void *a) { - return _mm_loadl_epi64((const __m128i*)a); + return _mm_loadl_epi64((const __m128i *)a); } static INLINE __m128i xx_load_128(const void *a) { - return _mm_load_si128((const __m128i*)a); + return _mm_load_si128((const __m128i *)a); } static INLINE __m128i xx_loadu_128(const void *a) { - return _mm_loadu_si128((const __m128i*)a); + return _mm_loadu_si128((const __m128i *)a); } static INLINE void xx_storel_32(void *const a, const __m128i v) { - *(uint32_t*)a = _mm_cvtsi128_si32(v); + *(uint32_t *)a = _mm_cvtsi128_si32(v); } static INLINE void xx_storel_64(void *const a, const __m128i v) { - _mm_storel_epi64((__m128i*)a, v); + _mm_storel_epi64((__m128i *)a, v); } static INLINE void xx_store_128(void *const a, const __m128i v) { - _mm_store_si128((__m128i*)a, v); + _mm_store_si128((__m128i *)a, v); } static INLINE void xx_storeu_128(void *const a, const __m128i v) { - _mm_storeu_si128((__m128i*)a, v); + _mm_storeu_si128((__m128i *)a, v); } static INLINE __m128i xx_round_epu16(__m128i v_val_w) { @@ -62,7 +62,7 @@ static INLINE __m128i xx_round_epu16(__m128i v_val_w) { } static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { - const __m128i v_s_w =_mm_srli_epi16(v_val_w, bits-1); + const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1); return _mm_avg_epu16(v_s_w, _mm_setzero_si128()); } @@ -75,8 +75,8 @@ static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1)); const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); - const __m128i v_tmp_d = _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), - v_sign_d); + const __m128i v_tmp_d = + _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); return _mm_srai_epi32(v_tmp_d, bits); } @@ -94,14 +94,14 @@ static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) { #else { int64_t tmp; - _mm_storel_epi64((__m128i*)&tmp, v_q); + _mm_storel_epi64((__m128i *)&tmp, v_q); return tmp; } #endif } static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { - const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); + const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); const __m128i v_1_q = 
_mm_unpackhi_epi32(v_d, v_sign_d); return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); diff --git a/vpx_dsp/x86/txfm_common_sse2.h b/vpx_dsp/x86/txfm_common_sse2.h index f886d30de57b63223ba505b24aa7f3384895c027..e148f5c8bf95051414bb270b042875d390994cea 100644 --- a/vpx_dsp/x86/txfm_common_sse2.h +++ b/vpx_dsp/x86/txfm_common_sse2.h @@ -14,15 +14,15 @@ #include <emmintrin.h> #include "vpx/vpx_integer.h" -#define pair_set_epi16(a, b) \ +#define pair_set_epi16(a, b) \ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) -#define dual_set_epi16(a, b) \ +#define dual_set_epi16(a, b) \ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) -#define octa_set_epi16(a, b, c, d, e, f, g, h) \ +#define octa_set_epi16(a, b, c, d, e, f, g, h) \ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index f8c97117d487ee46e654a9e7ac5f3cb0caa80d74..7bc2693cfbbfd0f41915fa4953610e128a83655c 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -14,13 +14,13 @@ typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, unsigned int *sse, int *sum); void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); + const uint8_t *ref, int ref_stride, unsigned int *sse, + int *sum); static void variance_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - int w, int h, unsigned int *sse, int *sum, - get_var_avx2 var_fn, int block_size) { + const uint8_t *ref, int ref_stride, int w, int h, + unsigned int *sse, int *sum, get_var_avx2 var_fn, + int block_size) { int i, j; *sse = 0; @@ -30,21 +30,20 @@ static void variance_avx2(const uint8_t *src, int src_stride, for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; - var_fn(&src[src_stride * i + j], src_stride, - &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); + var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j], + ref_stride, &sse0, &sum0); *sse += sse0; *sum += sum0; } } } - unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vpx_get16x16var_avx2, 16); + variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + vpx_get16x16var_avx2, 16); return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); } @@ -60,8 +59,8 @@ unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 16, - sse, &sum, vpx_get32x32var_avx2, 32); + variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, + vpx_get32x32var_avx2, 32); return *sse - (((int64_t)sum * sum) >> 9); } @@ -69,8 +68,8 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 32, - sse, &sum, vpx_get32x32var_avx2, 32); + variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, + vpx_get32x32var_avx2, 32); return *sse - (((int64_t)sum * sum) >> 10); } @@ -78,8 +77,8 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, 
int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 64, - sse, &sum, vpx_get32x32var_avx2, 32); + variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, + vpx_get32x32var_avx2, 32); return *sse - (((int64_t)sum * sum) >> 12); } @@ -87,79 +86,58 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 32, - sse, &sum, vpx_get32x32var_avx2, 32); + variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, + vpx_get32x32var_avx2, 32); return *sse - (((int64_t)sum * sum) >> 11); } unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, - int height, - unsigned int *sse); - -unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - const uint8_t *sec, - int sec_stride, - int height, - unsigned int *sseptr); + int height, unsigned int *sse); + +unsigned int vpx_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sseptr); unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, + int src_stride, int x_offset, + int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { unsigned int sse1; - const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - 64, &sse1); + const int se1 = vpx_sub_pixel_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); unsigned int sse2; - const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, - x_offset, y_offset, - dst + 32, dst_stride, - 64, &sse2); + const int se2 = + vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, + dst + 32, dst_stride, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; return *sse - (((int64_t)se * se) >> 12); } unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, + int src_stride, int x_offset, + int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { - const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - 32, sse); + const int se = vpx_sub_pixel_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); return *sse - (((int64_t)se * se) >> 10); } -unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - unsigned int *sse, - const uint8_t *sec) { +unsigned int vpx_sub_pixel_avg_variance64x64_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { unsigned int sse1; - const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - sec, 64, 64, &sse1); + const int se1 = vpx_sub_pixel_avg_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); unsigned int sse2; - const int se2 = - 
vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset, - y_offset, dst + 32, dst_stride, - sec + 32, 64, 64, &sse2); + const int se2 = vpx_sub_pixel_avg_variance32xh_avx2( + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, + 64, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; @@ -167,17 +145,11 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src, return *sse - (((int64_t)se * se) >> 12); } -unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - unsigned int *sse, - const uint8_t *sec) { +unsigned int vpx_sub_pixel_avg_variance32x32_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { // Process 32 elements in parallel. - const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - sec, 32, 32, sse); + const int se = vpx_sub_pixel_avg_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); return *sse - (((int64_t)se * se) >> 10); } diff --git a/vpx_dsp/x86/variance_impl_avx2.c b/vpx_dsp/x86/variance_impl_avx2.c index b289e9a0c74840d652aa74c38cf0ca0c0d2cf05a..f26eda3e5c05d0c63d7bd5921ebcb435383f9c24 100644 --- a/vpx_dsp/x86/variance_impl_avx2.c +++ b/vpx_dsp/x86/variance_impl_avx2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" +/* clang-format off */ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, @@ -31,289 +32,275 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, }; +/* clang-format on */ + +void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *SSE, int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + unsigned int i, src_2strides, ref_2strides; + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); + + // processing two strides in a 256 bit register reducing the number + // of loop stride by half (comparing to the sse2 code) + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; i++) { + src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr))); + src = _mm256_inserti128_si256( + src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1); + + ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr))); + ref = _mm256_inserti128_si256( + ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1); + + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + + // madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + + // add 
high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + + // add high to low + madd_ref_src = + _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + { + __m128i sum_res, madd_res; + __m128i expand_sum_low, expand_sum_high, expand_sum; + __m128i expand_madd_low, expand_madd_high, expand_madd; + __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; -void vpx_get16x16var_avx2(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i, src_2strides, ref_2strides; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing two strides in a 256 bit register reducing the number - // of loop stride by half (comparing to the sse2 code) - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; i++) { - src = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i const *) (src_ptr))); - src = _mm256_inserti128_si256(src, - _mm_loadu_si128((__m128i const *)(src_ptr+source_stride)), 1); - - ref =_mm256_castsi128_si256( - _mm_loadu_si128((__m128i const *) (ref_ptr))); - ref = _mm256_inserti128_si256(ref, - _mm_loadu_si128((__m128i const *)(ref_ptr+recon_stride)), 1); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = _mm256_add_epi32(madd_ref_src, - _mm256_add_epi32(madd_low, madd_high)); - - src_ptr+= src_2strides; - ref_ptr+= ref_2strides; - } - - { - __m128i sum_res, madd_res; - __m128i expand_sum_low, expand_sum_high, expand_sum; - __m128i expand_madd_low, expand_madd_high, expand_madd; - __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // extract the low lane and add it to the high lane - sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), - _mm256_extractf128_si256(sum_ref_src, 1)); + // extract the low lane and add it to the high lane + sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), + _mm256_extractf128_si256(sum_ref_src, 1)); - madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), - _mm256_extractf128_si256(madd_ref_src, 1)); + madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), + _mm256_extractf128_si256(madd_ref_src, 1)); - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), - sum_res); - expand_sum_high = 
_mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), - sum_res); + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = + _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res); + expand_sum_high = + _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - // shifting the sign 16 bits right - expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); + // shifting the sign 16 bits right + expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); - expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); + expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = _mm_unpacklo_epi32(madd_res, - _mm256_castsi256_si128(zero_reg)); - expand_madd_high = _mm_unpackhi_epi32(madd_res, - _mm256_castsi256_si128(zero_reg)); + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = + _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); + expand_madd_high = + _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); + expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); - ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum, - _mm256_castsi256_si128(zero_reg)); - ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum, - _mm256_castsi256_si128(zero_reg)); + ex_expand_sum_low = + _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); + ex_expand_sum_high = + _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); + ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - // shift 8 bytes eight - madd_res = _mm_srli_si128(expand_madd, 8); - sum_res = _mm_srli_si128(ex_expand_sum, 8); + // shift 8 bytes eight + madd_res = _mm_srli_si128(expand_madd, 8); + sum_res = _mm_srli_si128(ex_expand_sum, 8); - madd_res = _mm_add_epi32(madd_res, expand_madd); - sum_res = _mm_add_epi32(sum_res, ex_expand_sum); + madd_res = _mm_add_epi32(madd_res, expand_madd); + sum_res = _mm_add_epi32(sum_res, ex_expand_sum); - *((int*)SSE)= _mm_cvtsi128_si32(madd_res); + *((int *)SSE) = _mm_cvtsi128_si32(madd_res); - *((int*)Sum)= _mm_cvtsi128_si32(sum_res); - } + *((int *)Sum) = _mm_cvtsi128_si32(sum_res); + } } -void vpx_get32x32var_avx2(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); +void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *SSE, int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + unsigned int i; + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); - // processing 32 elements in parallel - for (i = 0; i < 16; i++) { - src = _mm256_loadu_si256((__m256i const *) (src_ptr)); + // processing 32 elements in parallel + for (i = 0; i < 16; i++) { + src = _mm256_loadu_si256((__m256i const *)(src_ptr)); - 
ref = _mm256_loadu_si256((__m256i const *) (ref_ptr)); + ref = _mm256_loadu_si256((__m256i const *)(ref_ptr)); - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + // madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + // add high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - // add high to low - madd_ref_src = _mm256_add_epi32(madd_ref_src, - _mm256_add_epi32(madd_low, madd_high)); + // add high to low + madd_ref_src = + _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - src_ptr+= source_stride; - ref_ptr+= recon_stride; - } + src_ptr += source_stride; + ref_ptr += recon_stride; + } - { - __m256i expand_sum_low, expand_sum_high, expand_sum; - __m256i expand_madd_low, expand_madd_high, expand_madd; - __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; + { + __m256i expand_sum_low, expand_sum_high, expand_sum; + __m256i expand_madd_low, expand_madd_high, expand_madd; + __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); - expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); + expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); - // shifting the sign 16 bits right - expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); + // shifting the sign 16 bits right + expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); - expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); + expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); - expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); + expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); - expand_madd = 
_mm256_add_epi32(expand_madd_low, expand_madd_high); + expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); - ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); - ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); + ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); + ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); - ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); + ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - // shift 8 bytes eight - madd_ref_src = _mm256_srli_si256(expand_madd, 8); - sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); + // shift 8 bytes eight + madd_ref_src = _mm256_srli_si256(expand_madd, 8); + sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); - madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); - sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); + madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); + sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); - // extract the low lane and the high lane and add the results - *((int*)SSE)= _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); + // extract the low lane and the high lane and add the results + *((int *)SSE) = + _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); - *((int*)Sum)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); - } + *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); + } } -#define FILTER_SRC(filter) \ - /* filter the source */ \ +#define FILTER_SRC(filter) \ + /* filter the source */ \ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ - \ - /* add 8 to source */ \ - exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ - exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ - \ - /* divide source by 16 */ \ - exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); -#define MERGE_WITH_SRC(src_reg, reg) \ +#define MERGE_WITH_SRC(src_reg, reg) \ exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); -#define LOAD_SRC_DST \ - /* load source and destination */ \ - src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ - dst_reg = _mm256_loadu_si256((__m256i const *) (dst)); +#define LOAD_SRC_DST \ + /* load source and destination */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); -#define AVG_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *) \ - (src + size_stride)); \ - /* average between current and next stride source */ \ +#define AVG_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + /* average between current and next stride source */ \ src_reg = _mm256_avg_epu8(src_reg, src_next_reg); -#define MERGE_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *) \ - (src + size_stride)); \ +#define 
MERGE_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ MERGE_WITH_SRC(src_reg, src_next_reg) -#define CALC_SUM_SSE_INSIDE_LOOP \ - /* expand each byte to 2 bytes */ \ - exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ - exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ - /* source - dest */ \ - exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ - exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ - /* caculate sum */ \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* caculate sum */ \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ - /* calculate sse */ \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ + /* calculate sse */ \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); // final calculation to sum and sse -#define CALC_SUM_AND_SSE \ - res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ - sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ - sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ - \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ - \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); - -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - int height, - unsigned int *sse) { +unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, 
+ int height, unsigned int *sse) { __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; __m256i zero_reg; @@ -325,66 +312,66 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, // x_offset = 0 and y_offset = 0 if (x_offset == 0) { if (y_offset == 0) { - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST // expend each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = 0 and y_offset = 8 + // x_offset = 0 and y_offset = 8 } else if (y_offset == 8) { __m256i src_next_reg; - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, src_stride) // expend each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = 0 and y_offset = bilin interpolation + // x_offset = 0 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg; y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, src_stride) FILTER_SRC(filter) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } } - // x_offset = 8 and y_offset = 0 + // x_offset = 8 and y_offset = 0 } else if (x_offset == 8) { if (y_offset == 0) { __m256i src_next_reg; - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) // expand each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = 8 and y_offset = 8 + // x_offset = 8 and y_offset = 8 } else if (y_offset == 8) { __m256i src_next_reg, src_avg; // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { src_avg = src_reg; - src+= src_stride; + src += src_stride; LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) // average between previous average to current average @@ -393,92 +380,92 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, MERGE_WITH_SRC(src_avg, zero_reg) // save current source average CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } - // x_offset = 8 and y_offset = bilin interpolation + // x_offset = 8 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg, src_avg; y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { // save current source average src_avg = src_reg; - src+= 
src_stride; + src += src_stride; LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) MERGE_WITH_SRC(src_avg, src_reg) FILTER_SRC(filter) CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } } - // x_offset = bilin interpolation and y_offset = 0 + // x_offset = bilin interpolation and y_offset = 0 } else { if (y_offset == 0) { __m256i filter, pw8, src_next_reg; x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = 8 + // x_offset = bilin interpolation and y_offset = 8 } else if (y_offset == 8) { __m256i filter, pw8, src_next_reg, src_pack; x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); // average between previous pack to the current src_pack = _mm256_avg_epu8(src_pack, src_reg); MERGE_WITH_SRC(src_pack, zero_reg) CALC_SUM_SSE_INSIDE_LOOP src_pack = src_reg; - dst+= dst_stride; + dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = bilin interpolation + // x_offset = bilin interpolation and y_offset = bilin interpolation } else { __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; x_offset <<= 5; - xfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); y_offset <<= 5; - yfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(xfilter) // convert each 16 bit to 8 bit to each low and high lane source src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; + for (i = 0; i < height; i++) { + src += src_stride; LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(xfilter) @@ -489,7 +476,7 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, FILTER_SRC(yfilter) src_pack = src_reg; CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } } } @@ -497,16 +484,10 @@ unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, return sum; } -unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, 
- int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - const uint8_t *sec, - int sec_stride, - int height, - unsigned int *sse) { +unsigned int vpx_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sse) { __m256i sec_reg; __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; @@ -519,190 +500,190 @@ unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, // x_offset = 0 and y_offset = 0 if (x_offset == 0) { if (y_offset == 0) { - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; + sec += sec_stride; // expend each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } } else if (y_offset == 8) { __m256i src_next_reg; - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; + sec += sec_stride; // expend each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = 0 and y_offset = bilin interpolation + // x_offset = 0 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg; y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, src_stride) FILTER_SRC(filter) src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; + sec += sec_stride; MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } } - // x_offset = 8 and y_offset = 0 + // x_offset = 8 and y_offset = 0 } else if (x_offset == 8) { if (y_offset == 0) { __m256i src_next_reg; - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; + sec += sec_stride; // expand each byte to 2 bytes MERGE_WITH_SRC(src_reg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = 8 and y_offset = 8 + // x_offset = 8 and y_offset = 8 } else if (y_offset == 8) { __m256i src_next_reg, src_avg; // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); 
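/*
 * Illustrative sketch: the *_avg_* variant that begins above differs from the
 * plain variance kernel by one extra step -- after sub-pixel filtering, each
 * row is averaged with the corresponding row of the second predictor `sec`
 * before the sum/SSE accumulation.  _mm256_avg_epu8() rounds up, i.e. per
 * byte:
 */
static unsigned char avg_round_up(unsigned char filtered, unsigned char sec) {
  return (unsigned char)((filtered + sec + 1) >> 1);
}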
AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { // save current source average src_avg = src_reg; - src+= src_stride; + src += src_stride; LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) // average between previous average to current average src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_avg = _mm256_avg_epu8(src_avg, sec_reg); - sec+= sec_stride; + sec += sec_stride; // expand each byte to 2 bytes MERGE_WITH_SRC(src_avg, zero_reg) CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } - // x_offset = 8 and y_offset = bilin interpolation + // x_offset = 8 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg, src_avg; y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { // save current source average src_avg = src_reg; - src+= src_stride; + src += src_stride; LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) MERGE_WITH_SRC(src_avg, src_reg) FILTER_SRC(filter) src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_avg = _mm256_avg_epu8(src_avg, sec_reg); // expand each byte to 2 bytes MERGE_WITH_SRC(src_avg, zero_reg) - sec+= sec_stride; + sec += sec_stride; CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } } - // x_offset = bilin interpolation and y_offset = 0 + // x_offset = bilin interpolation and y_offset = 0 } else { if (y_offset == 0) { __m256i filter, pw8, src_next_reg; x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { + for (i = 0; i < height; i++) { LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); MERGE_WITH_SRC(src_reg, zero_reg) - sec+= sec_stride; + sec += sec_stride; CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; + src += src_stride; + dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = 8 + // x_offset = bilin interpolation and y_offset = 8 } else if (y_offset == 8) { __m256i filter, pw8, src_next_reg, src_pack; x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; + src_pack = 
_mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); // average between previous pack to the current src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_pack = _mm256_avg_epu8(src_pack, sec_reg); - sec+= sec_stride; + sec += sec_stride; MERGE_WITH_SRC(src_pack, zero_reg) src_pack = src_reg; CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = bilin interpolation + // x_offset = bilin interpolation and y_offset = bilin interpolation } else { __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; x_offset <<= 5; - xfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); y_offset <<= 5; - yfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); pw8 = _mm256_set1_epi16(8); // load source and another source starting from the next // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(xfilter) // convert each 16 bit to 8 bit to each low and high lane source src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; + for (i = 0; i < height; i++) { + src += src_stride; LOAD_SRC_DST MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(xfilter) @@ -712,13 +693,13 @@ unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, // filter the source FILTER_SRC(yfilter) src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); src_pack = _mm256_avg_epu8(src_pack, sec_reg); MERGE_WITH_SRC(src_pack, zero_reg) src_pack = src_reg; - sec+= sec_stride; + sec += sec_stride; CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; + dst += dst_stride; } } } diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index e76c1a287285bb058611dc93cd32b9d57e01941b..e40eed7fea6639127d1c7027533294d6051d142f 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -15,9 +15,9 @@ #include "vpx_ports/mem.h" -typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); +typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { __m128i vsum = _mm_setzero_si128(); @@ -31,11 +31,12 @@ unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - return _mm_cvtsi128_si32(vsum); + return _mm_cvtsi128_si32(vsum); } -#define READ64(p, stride, i) \ - _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ +#define READ64(p, stride, i) \ + _mm_unpacklo_epi8( \ + _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) 
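/*
 * Illustrative sketch: a plain-C reference for the get4x4var_sse2() kernel
 * that the READ64() macro above feeds -- accumulate the signed sum and the
 * sum of squared differences over a 4x4 block.  The SIMD version interleaves
 * two rows per load, but the quantities computed are the same.
 */
static void get4x4var_c(const unsigned char *src, int src_stride,
                        const unsigned char *ref, int ref_stride,
                        unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      *sum += d;
      *sse += (unsigned int)(d * d);
    }
  }
}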
static void get4x4var_sse2(const uint8_t *src, int src_stride, @@ -57,32 +58,31 @@ static void get4x4var_sse2(const uint8_t *src, int src_stride, *sum = (int16_t)_mm_extract_epi16(vsum, 0); // sse - vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), - _mm_madd_epi16(diff1, diff1)); + vsum = + _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1)); vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); *sse = _mm_cvtsi128_si32(vsum); } -void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { +void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, unsigned int *sse, int *sum) { const __m128i zero = _mm_setzero_si128(); __m128i vsum = _mm_setzero_si128(); __m128i vsse = _mm_setzero_si128(); int i; for (i = 0; i < 8; i += 2) { - const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(src + i * src_stride)), zero); - const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(ref + i * ref_stride)), zero); + const __m128i src0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero); + const __m128i ref0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero); const __m128i diff0 = _mm_sub_epi16(src0, ref0); - const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(src + (i + 1) * src_stride)), zero); - const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(ref + (i + 1) * ref_stride)), zero); + const __m128i src1 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero); + const __m128i ref1 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero); const __m128i diff1 = _mm_sub_epi16(src1, ref1); vsum = _mm_add_epi16(vsum, diff0); @@ -104,8 +104,8 @@ void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, } void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { + const uint8_t *ref, int ref_stride, unsigned int *sse, + int *sum) { const __m128i zero = _mm_setzero_si128(); __m128i vsum = _mm_setzero_si128(); __m128i vsse = _mm_setzero_si128(); @@ -135,8 +135,8 @@ void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, // sum vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0) + - (int16_t)_mm_extract_epi16(vsum, 1); + *sum = + (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1); // sse vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); @@ -144,10 +144,9 @@ void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, *sse = _mm_cvtsi128_si32(vsse); } - static void variance_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - int w, int h, unsigned int *sse, int *sum, + const unsigned char *ref, int ref_stride, int w, + int h, unsigned int *sse, int *sum, getNxMvar_fn_t var_fn, int block_size) { int i, j; @@ -158,8 +157,8 @@ static void variance_sse2(const unsigned char *src, int src_stride, for (j = 0; j < w; j += block_size) { unsigned int sse0; int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); 
*sse += sse0; *sum += sum0; } @@ -178,8 +177,8 @@ unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 4, - sse, &sum, get4x4var_sse2, 4); + variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum, + get4x4var_sse2, 4); return *sse - ((sum * sum) >> 5); } @@ -187,8 +186,8 @@ unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 8, - sse, &sum, get4x4var_sse2, 4); + variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum, + get4x4var_sse2, 4); return *sse - ((sum * sum) >> 5); } @@ -204,8 +203,8 @@ unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 8, - sse, &sum, vpx_get8x8var_sse2, 8); + variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum, + vpx_get8x8var_sse2, 8); return *sse - ((sum * sum) >> 7); } @@ -213,8 +212,8 @@ unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 16, - sse, &sum, vpx_get8x8var_sse2, 8); + variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum, + vpx_get8x8var_sse2, 8); return *sse - ((sum * sum) >> 7); } @@ -230,8 +229,8 @@ unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 32, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 10); } @@ -239,8 +238,8 @@ unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 16, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 9); } @@ -248,8 +247,8 @@ unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 32, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 9); } @@ -257,8 +256,8 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 64, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 12); } @@ -266,8 +265,8 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 32, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 11); } 
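/*
 * The wrappers above all follow the same pattern: accumulate SSE and sum over
 * the block, then return variance = SSE - sum^2 / (w*h), with the division
 * written as a shift by log2(w*h) (e.g. 64x32 -> >> 11).  The (int64_t) cast
 * keeps sum*sum from overflowing 32 bits on the larger blocks.  Minimal
 * sketch of that final step:
 */
static unsigned int variance_from_sums(unsigned int sse, int sum,
                                       int wlog2, int hlog2) {
  return sse - (unsigned int)(((long long)sum * sum) >> (wlog2 + hlog2));
}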
@@ -275,8 +274,8 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 64, - sse, &sum, vpx_get16x16var_sse2, 16); + variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum, + vpx_get16x16var_sse2, 16); return *sse - (((int64_t)sum * sum) >> 11); } @@ -310,17 +309,14 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, // The 2 unused parameters are place holders for PIC enabled build. // These definitions are for functions defined in subpel_variance.asm -#define DECL(w, opt) \ - int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint8_t *dst, \ - ptrdiff_t dst_stride, \ - int height, unsigned int *sse, \ - void *unused0, void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ + void *unused0, void *unused) #define DECLS(opt1, opt2) \ - DECL(4, opt1); \ - DECL(8, opt1); \ + DECL(4, opt1); \ + DECL(8, opt1); \ DECL(16, opt1) DECLS(sse2, sse2); @@ -328,59 +324,52 @@ DECLS(ssse3, ssse3); #undef DECLS #undef DECL -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ -unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ - int src_stride, \ - int x_offset, \ - int y_offset, \ - const uint8_t *dst, \ - int dst_stride, \ - unsigned int *sse_ptr) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - h, &sse, NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \ - x_offset, y_offset, \ - dst + 16, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \ -} +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + unsigned int sse; \ + int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + h, &sse, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (cast_prod(cast se * se) >> (wlog2 + 
hlog2)); \ + } -#define FNS(opt1, opt2) \ -FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ -FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ -FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ -FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ -FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ -FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ -FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ -FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \ -FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \ -FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \ -FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \ -FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \ -FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) FNS(sse2, sse2); FNS(ssse3, ssse3); @@ -389,84 +378,69 @@ FNS(ssse3, ssse3); #undef FN // The 2 unused parameters are place holders for PIC enabled build. -#define DECL(w, opt) \ -int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint8_t *dst, \ - ptrdiff_t dst_stride, \ - const uint8_t *sec, \ - ptrdiff_t sec_stride, \ - int height, unsigned int *sse, \ - void *unused0, void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused) #define DECLS(opt1, opt2) \ -DECL(4, opt1); \ -DECL(8, opt1); \ -DECL(16, opt1) + DECL(4, opt1); \ + DECL(8, opt1); \ + DECL(16, opt1) DECLS(sse2, sse2); DECLS(ssse3, ssse3); #undef DECL #undef DECLS -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ -unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \ - int src_stride, \ - int x_offset, \ - int y_offset, \ - const uint8_t *dst, \ - int dst_stride, \ - unsigned int *sseptr, \ - const uint8_t *sec) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - sec, w, h, &sse, NULL, \ - NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \ - x_offset, y_offset, \ - dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sseptr = sse; 
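/*
 * Illustrative sketch of what one FN(64, 64, 16, 6, 6, ...) expansion above
 * does: the 16-wide kernel is run on the columns at x = 0, 16, 32 and 48, the
 * partial sums and SSEs are added, and the usual variance formula is applied.
 * The function-pointer type below is a simplified stand-in for the real
 * vpx_sub_pixel_variance16xh_<opt> signature (the unused PIC parameters are
 * dropped).
 */
typedef int (*subpel_var16xh_fn)(const unsigned char *src, int src_stride,
                                 int x_offset, int y_offset,
                                 const unsigned char *dst, int dst_stride,
                                 int h, unsigned int *sse);

static unsigned int subpel_variance_64x64_sketch(
    subpel_var16xh_fn kernel, const unsigned char *src, int src_stride,
    int x_offset, int y_offset, const unsigned char *dst, int dst_stride,
    unsigned int *sse_ptr) {
  unsigned int sse = 0, sse_col;
  int se = 0, x;
  for (x = 0; x < 64; x += 16) {
    se += kernel(src + x, src_stride, x_offset, y_offset, dst + x, dst_stride,
                 64, &sse_col);
    sse += sse_col;
  }
  *sse_ptr = sse;
  return sse - (unsigned int)(((long long)se * se) >> 12); /* 6 + 6 */
}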
\ - return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \ -} +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sseptr, \ + const uint8_t *sec) { \ + unsigned int sse; \ + int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sseptr = sse; \ + return sse - (cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } -#define FNS(opt1, opt2) \ -FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ -FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ -FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ -FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ -FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ -FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ -FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ -FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \ -FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \ -FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \ -FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \ -FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \ -FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) FNS(sse2, sse); FNS(ssse3, ssse3); @@ -474,216 +448,215 @@ FNS(ssse3, ssse3); #undef FNS #undef FN -void vpx_upsampled_pred_sse2(uint8_t *comp_pred, - int width, int height, - const uint8_t *ref, int ref_stride) { - int i, j; - int stride = ref_stride << 3; - - if (width >= 16) { - // read 16 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 16) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); - __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); - __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); - __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); - __m128i 
t0, t1, t2, t3; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - t2 = _mm_unpacklo_epi8(s4, s5); - s5 = _mm_unpackhi_epi8(s4, s5); - t3 = _mm_unpacklo_epi8(s6, s7); - s7 = _mm_unpackhi_epi8(s6, s7); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s4 = _mm_unpacklo_epi8(t2, s5); - s6 = _mm_unpacklo_epi8(t3, s7); - s0 = _mm_unpacklo_epi32(s0, s2); - s4 = _mm_unpacklo_epi32(s4, s6); - s0 = _mm_unpacklo_epi64(s0, s4); - - _mm_storeu_si128((__m128i *)(comp_pred), s0); - comp_pred += 16; - ref += 16 * 8; - } - ref += stride - (width << 3); +void vpx_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height, + const uint8_t *ref, int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + if (width >= 16) { + // read 16 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 16) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); + __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); + __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); + __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); + __m128i t0, t1, t2, t3; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + t2 = _mm_unpacklo_epi8(s4, s5); + s5 = _mm_unpackhi_epi8(s4, s5); + t3 = _mm_unpacklo_epi8(s6, s7); + s7 = _mm_unpackhi_epi8(s6, s7); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s4 = _mm_unpacklo_epi8(t2, s5); + s6 = _mm_unpacklo_epi8(t3, s7); + s0 = _mm_unpacklo_epi32(s0, s2); + s4 = _mm_unpacklo_epi32(s4, s6); + s0 = _mm_unpacklo_epi64(s0, s4); + + _mm_storeu_si128((__m128i *)(comp_pred), s0); + comp_pred += 16; + ref += 16 * 8; } - } else if (width >= 8) { - // read 8 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 8) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i t0, t1; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s0 = _mm_unpacklo_epi32(s0, s2); - - _mm_storel_epi64((__m128i *)(comp_pred), s0); - comp_pred += 8; - ref += 8 * 8; - } - ref += stride - (width << 3); + ref += stride - (width << 3); + } + } else if (width >= 8) { + // read 8 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i t0, t1; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s0 = _mm_unpacklo_epi32(s0, s2); + + _mm_storel_epi64((__m128i *)(comp_pred), s0); + comp_pred += 8; + ref += 8 * 8; } - } else { - // 
read 4 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 4) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i t0; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - s0 = _mm_unpacklo_epi8(t0, s1); - - *(int *)comp_pred = _mm_cvtsi128_si32(s0); - comp_pred += 4; - ref += 4 * 8; - } - ref += stride - (width << 3); + ref += stride - (width << 3); + } + } else { + // read 4 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i t0; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + s0 = _mm_unpacklo_epi8(t0, s1); + + *(int *)comp_pred = _mm_cvtsi128_si32(s0); + comp_pred += 4; + ref += 4 * 8; } + ref += stride - (width << 3); } + } } void vpx_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, - const uint8_t *ref, int ref_stride) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - int i, j; - int stride = ref_stride << 3; - - if (width >= 16) { - // read 16 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 16) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); - __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); - __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); - __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); - __m128i p0 = _mm_loadu_si128((const __m128i *)pred); - __m128i p1; - __m128i t0, t1, t2, t3; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - t2 = _mm_unpacklo_epi8(s4, s5); - s5 = _mm_unpackhi_epi8(s4, s5); - t3 = _mm_unpacklo_epi8(s6, s7); - s7 = _mm_unpackhi_epi8(s6, s7); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s4 = _mm_unpacklo_epi8(t2, s5); - s6 = _mm_unpacklo_epi8(t3, s7); - - s0 = _mm_unpacklo_epi32(s0, s2); - s4 = _mm_unpacklo_epi32(s4, s6); - s0 = _mm_unpacklo_epi8(s0, zero); - s4 = _mm_unpacklo_epi8(s4, zero); - - p1 = _mm_unpackhi_epi8(p0, zero); - p0 = _mm_unpacklo_epi8(p0, zero); - p0 = _mm_adds_epu16(s0, p0); - p1 = _mm_adds_epu16(s4, p1); - p0 = _mm_adds_epu16(p0, one); - p1 = _mm_adds_epu16(p1, one); - - p0 = _mm_srli_epi16(p0, 1); - p1 = _mm_srli_epi16(p1, 1); - p0 = _mm_packus_epi16(p0, p1); - - _mm_storeu_si128((__m128i *)(comp_pred), p0); - comp_pred += 16; - pred += 16; - ref += 16 * 8; - } - ref += stride - (width << 3); + int width, int height, const uint8_t *ref, + int ref_stride) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + int i, j; + int stride = ref_stride << 3; + + if (width >= 16) { + // read 16 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 16) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); + __m128i s5 = _mm_loadu_si128((const 
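/*
 * Illustrative sketch: scalar equivalent of the vpx_upsampled_pred_sse2()
 * body above.  The reference buffer is upsampled 8x horizontally (its row
 * stride is ref_stride * 8), so forming the prediction amounts to taking
 * every 8th byte of each reference row; the SSE2 code does this 16, 8 or 4
 * output pixels at a time with byte shuffles.
 */
static void upsampled_pred_c(unsigned char *comp_pred, int width, int height,
                             const unsigned char *ref, int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) comp_pred[j] = ref[j * 8];
    comp_pred += width;
    ref += ref_stride * 8;
  }
}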
__m128i *)(ref + 80)); + __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); + __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + __m128i p1; + __m128i t0, t1, t2, t3; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + t2 = _mm_unpacklo_epi8(s4, s5); + s5 = _mm_unpackhi_epi8(s4, s5); + t3 = _mm_unpacklo_epi8(s6, s7); + s7 = _mm_unpackhi_epi8(s6, s7); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s4 = _mm_unpacklo_epi8(t2, s5); + s6 = _mm_unpacklo_epi8(t3, s7); + + s0 = _mm_unpacklo_epi32(s0, s2); + s4 = _mm_unpacklo_epi32(s4, s6); + s0 = _mm_unpacklo_epi8(s0, zero); + s4 = _mm_unpacklo_epi8(s4, zero); + + p1 = _mm_unpackhi_epi8(p0, zero); + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p1 = _mm_adds_epu16(s4, p1); + p0 = _mm_adds_epu16(p0, one); + p1 = _mm_adds_epu16(p1, one); + + p0 = _mm_srli_epi16(p0, 1); + p1 = _mm_srli_epi16(p1, 1); + p0 = _mm_packus_epi16(p0, p1); + + _mm_storeu_si128((__m128i *)(comp_pred), p0); + comp_pred += 16; + pred += 16; + ref += 16 * 8; } - } else if (width >= 8) { - // read 8 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 8) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i p0 = _mm_loadl_epi64((const __m128i *)pred); - __m128i t0, t1; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s0 = _mm_unpacklo_epi32(s0, s2); - s0 = _mm_unpacklo_epi8(s0, zero); - - p0 = _mm_unpacklo_epi8(p0, zero); - p0 = _mm_adds_epu16(s0, p0); - p0 = _mm_adds_epu16(p0, one); - p0 = _mm_srli_epi16(p0, 1); - p0 = _mm_packus_epi16(p0, zero); - - _mm_storel_epi64((__m128i *)(comp_pred), p0); - comp_pred += 8; - pred += 8; - ref += 8 * 8; - } - ref += stride - (width << 3); + ref += stride - (width << 3); + } + } else if (width >= 8) { + // read 8 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i p0 = _mm_loadl_epi64((const __m128i *)pred); + __m128i t0, t1; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s0 = _mm_unpacklo_epi32(s0, s2); + s0 = _mm_unpacklo_epi8(s0, zero); + + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p0 = _mm_adds_epu16(p0, one); + p0 = _mm_srli_epi16(p0, 1); + p0 = _mm_packus_epi16(p0, zero); + + _mm_storel_epi64((__m128i *)(comp_pred), p0); + comp_pred += 8; + pred += 8; + ref += 8 * 8; } - } else { - // read 4 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j+= 4) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred); - __m128i t0; - - t0 = _mm_unpacklo_epi8(s0, 
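/*
 * Illustrative sketch: scalar equivalent of vpx_comp_avg_upsampled_pred_sse2()
 * -- the every-8th-byte reference sample is averaged, rounding up (matching
 * the adds / +1 / >>1 sequence in the intrinsics), with the existing
 * predictor.
 */
static void comp_avg_upsampled_pred_c(unsigned char *comp_pred,
                                      const unsigned char *pred, int width,
                                      int height, const unsigned char *ref,
                                      int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j)
      comp_pred[j] = (unsigned char)((ref[j * 8] + pred[j] + 1) >> 1);
    comp_pred += width;
    pred += width;
    ref += ref_stride * 8;
  }
}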
s1); - s1 = _mm_unpackhi_epi8(s0, s1); - s0 = _mm_unpacklo_epi8(t0, s1); - s0 = _mm_unpacklo_epi8(s0, zero); - - p0 = _mm_unpacklo_epi8(p0, zero); - p0 = _mm_adds_epu16(s0, p0); - p0 = _mm_adds_epu16(p0, one); - p0 = _mm_srli_epi16(p0, 1); - p0 = _mm_packus_epi16(p0, zero); - - *(int *)comp_pred = _mm_cvtsi128_si32(p0); - comp_pred += 4; - pred += 4; - ref += 4 * 8; - } - ref += stride - (width << 3); + ref += stride - (width << 3); + } + } else { + // read 4 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred); + __m128i t0; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + s0 = _mm_unpacklo_epi8(t0, s1); + s0 = _mm_unpacklo_epi8(s0, zero); + + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p0 = _mm_adds_epu16(p0, one); + p0 = _mm_srli_epi16(p0, 1); + p0 = _mm_packus_epi16(p0, zero); + + *(int *)comp_pred = _mm_cvtsi128_si32(p0); + comp_pred += 4; + pred += 4; + ref += 4 * 8; } + ref += stride - (width << 3); } + } } diff --git a/vpx_dsp/x86/vpx_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c index 422b0fc422d68e0a22d81aec174a4ae5750169de..727d9d1156ead8b04aff6cfa0850763299eb61a4 100644 --- a/vpx_dsp/x86/vpx_asm_stubs.c +++ b/vpx_dsp/x86/vpx_asm_stubs.c @@ -75,7 +75,7 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); // const int16_t *filter_y, int y_step_q4, // int w, int h); FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_ , sse2); +FUN_CONV_2D(avg_, sse2); #if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; @@ -157,6 +157,6 @@ HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, // const int16_t *filter_y, int y_step_q4, // int w, int h, int bd); HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_ , sse2); +HIGH_FUN_CONV_2D(avg_, sse2); #endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 #endif // HAVE_SSE2 diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 01cf4354a0938aee25c27bd192099729d16f123b..6d53b8705ddf9573efee76d392e8402614b50ab8 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -36,35 +36,32 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { }; #if defined(__clang__) -# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (defined(__APPLE__) && defined(__apple_build_version__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) - -# define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -# else // clang > 3.3, and not 5.0 on macosx. -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -# endif // clang <= 3.3 +#if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) + +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *) & (x)) +#else // clang > 3.3, and not 5.0 on macosx. 
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 #elif defined(__GNUC__) -# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -# define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -# else // gcc > 4.7 -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -# endif // gcc <= 4.6 -#else // !(gcc || clang) -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *) & (x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) #endif // __clang__ -static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { +static void vpx_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; @@ -78,26 +75,22 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
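/*
 * Illustrative sketch: scalar reference for the 8-tap horizontal convolution
 * that vpx_filter_block1d16_h8_avx2() implements.  The taps are applied to
 * src[x-3..x+4] (hence the src_ptr - 3 loads), the result is rounded with +64
 * and shifted right by 7 (the addFilterReg64 / srai-by-7 pair), and finally
 * saturated to 8 bits (the packus step).
 */
static unsigned char filter8_h_c(const unsigned char *src,
                                 const short *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k - 3] * filter[k];
  sum = (sum + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}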
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x100u)); + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x302u)); + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x504u)); + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x706u)); + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); @@ -107,17 +100,18 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; - for (i = output_height; i > 1; i-=2) { + for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source - srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm_loadu_si128((const __m128i *) - (src_ptr+src_pixels_per_line-3)), 1); + srcReg32b1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); + srcReg32b1 = _mm256_inserti128_si256( + srcReg32b1, + _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), + 1); // filter the source buffer - srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); @@ -127,28 +121,29 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, - _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b1_1 = _mm256_adds_epi16( + srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg32b2 = 
_mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm_loadu_si128((const __m128i *) - (src_ptr+src_pixels_per_line+5)), 1); + srcReg32b2 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); + srcReg32b2 = _mm256_inserti128_si256( + srcReg32b2, + _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), + 1); // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, - _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b1_1 = _mm256_adds_epi16( + srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); @@ -162,19 +157,18 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, - _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, - _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - + srcRegFilt32b2_1 = _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); @@ -187,19 +181,18 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, - srcRegFilt32b2_1); + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); - src_ptr+=src_stride; + src_ptr += src_stride; // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); + _mm_store_si128((__m128i *)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); // save the next 16 bits - _mm_store_si128((__m128i*)(output_ptr+output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); - output_ptr+=dst_stride; + _mm_store_si128((__m128i *)(output_ptr + output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr += dst_stride; } // if the number of strides is odd. 
@@ -211,83 +204,74 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt4Reg)); + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, - _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(forthFilters)); + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt3Reg)); + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(thirdFilters)); + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes // (part of it was being read by earlier read) srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, - _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(forthFilters)); + srcRegFilt2_1 = + _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt3Reg)); + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, 
_mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(thirdFilters)); + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); @@ -299,16 +283,13 @@ static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); } } -static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { +static void vpx_filter_block1d16_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; @@ -323,60 +304,56 @@ static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. 
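/*
 * Illustrative sketch: scalar reference for the vertical 8-tap convolution
 * that vpx_filter_block1d16_v8_avx2() implements -- the same taps and the
 * same +64 / >>7 rounding and 8-bit saturation as the horizontal kernel,
 * applied down a column.  `src` points at the first of the 8 contributing
 * rows (the caller has already backed the pointer up by 3 rows).
 */
static unsigned char filter8_v_c(const unsigned char *src, int pitch,
                                 const short *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k * pitch] * filter[k];
  sum = (sum + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}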
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x100u)); + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x302u)); + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x504u)); + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x706u)); + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr))); + srcReg32b1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); + _mm256_castsi256_si128(srcReg32b2), 1); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); + _mm256_castsi256_si128(srcReg32b3), 1); srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); + _mm256_castsi256_si128(srcReg32b4), 1); srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); + _mm256_castsi256_si128(srcReg32b5), 1); srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); + _mm256_castsi256_si128(srcReg32b6), 1); srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); + _mm256_castsi256_si128(srcReg32b7), 1); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); @@ -394,89 +371,87 @@ static void 
vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, // save srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + // add and saturate the results together + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - for (i = output_height; i > 1; i-=2) { - // load the last 2 loads of 16 bytes and have every two - // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); - - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // 
multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); - - src_ptr+=src_stride; - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, - _mm256_castsi256_si128(srcReg32b1)); - - // save the next 16 bits - _mm_store_si128((__m128i*)(output_ptr+out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); - - output_ptr+=dst_stride; - - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr += src_stride; + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1)); + + // save the next 16 bits + _mm_store_si128((__m128i *)(output_ptr + out_pitch), + _mm256_extractf128_si256(srcReg32b1, 1)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; } if (i > 0) { __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; @@ -485,55 +460,53 @@ static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together - srcRegFilt4 = _mm_unpacklo_epi8( - _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - srcRegFilt7 = _mm_unpackhi_epi8( - _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = 
_mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, - _mm256_castsi256_si128(forthFilters)); + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, - _mm256_castsi256_si128(forthFilters)); + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = + _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); - // multiply 2 adjacent elements with the filter and add the result srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); + _mm256_castsi256_si128(secondFilters)); srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); + _mm256_castsi256_si128(secondFilters)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); + _mm256_castsi256_si128(thirdFilters)); srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); + _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_min_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_max_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); @@ -545,7 +518,7 @@ static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); } } @@ -575,10 +548,10 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3; #define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 -#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 -#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 -#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 -#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 
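One detail worth noting in the filter hunks above: the two middle partial sums are always folded into the accumulator as the min() result first and the max() result second, using saturating adds. The apparent intent is to let a negative partial offset the accumulator before a positive one pushes it toward the 16-bit saturation limit. A scalar sketch of that ordering, supplied by the editor for illustration only:

/* --- editor's sketch, not part of the patch --- */
#include <stdint.h>

static int16_t adds16(int16_t a, int16_t b) {  /* saturating 16-bit add */
  const int32_t s = (int32_t)a + b;
  return (int16_t)(s > INT16_MAX ? INT16_MAX : s < INT16_MIN ? INT16_MIN : s);
}

/* Fold two partial sums into acc: smaller first, then larger. */
static int16_t fold_min_then_max(int16_t acc, int16_t a, int16_t b) {
  const int16_t lo = a < b ? a : b;
  const int16_t hi = a < b ? b : a;
  return adds16(adds16(acc, lo), hi);
}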
+#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 +#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 +#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 +#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 69cd6967a9ddfd62a0be6cc12fae6c334cef7a7c..36af4dd132da85dbf0a73fe938b8603fd5265ec3 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -48,23 +48,20 @@ filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { +void vpx_filter_block1d4_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, srcReg, minReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 =_mm_set1_epi32((int)0x0400040u); + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
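The `_mm_set1_epi32((int)0x0400040u)` a few lines above is simply the value 64 in each 16-bit half of every 32-bit lane, matching its "0,64,0,64,..." comment; adding that register before the arithmetic shift by 7 rounds the Q7 filter sums to nearest, and the later packus step clamps to [0, 255]. The scalar equivalent of that rounding, as an editor's sketch (it ignores the intermediate 16-bit saturation the SIMD path can hit):

/* --- editor's sketch, not part of the patch --- */
#include <stdint.h>

static uint8_t round_shift_clip(int32_t sum) {
  const int32_t v = (sum + 64) >> 7;   /* add half of 128, then divide by 128 */
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}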
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits in the filter into the first lane firstFilters = _mm_shufflelo_epi16(filtersReg, 0); @@ -78,23 +75,23 @@ void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); // loading the local filters - shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); + shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); - srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); + srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1); + srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); + srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); @@ -110,21 +107,18 @@ void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr+=src_pixels_per_line; + src_ptr += src_pixels_per_line; // save only 4 bytes - *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); - output_ptr+=output_pitch; + output_ptr += output_pitch; } } -void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { +void vpx_filter_block1d8_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; @@ -136,7 +130,7 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
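In the 4-wide kernel above, the two `_mm_shuffle_epi8` calls with the filt1_4_h8/filt2_4_h8 masks do the data gathering: they rearrange the 16 loaded source bytes so that each adjacent byte pair in the shuffled result lines up with one pair of filter taps, which is exactly the layout maddubs expects. A scalar model of the pshufb semantics behind those calls (editor's sketch; the mask contents themselves are not visible in this patch):

/* --- editor's sketch, not part of the patch --- */
#include <stdint.h>

/* Byte shuffle as pshufb defines it: a set high bit in the mask writes
 * zero, otherwise the low four bits select a source byte. */
static void pshufb_ref(const uint8_t src[16], const uint8_t mask[16],
                       uint8_t out[16]) {
  for (int i = 0; i < 16; ++i)
    out[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
}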
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits (first and second byte) // across 128 bit register @@ -160,16 +154,16 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); + srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); + srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); + srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); + srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); @@ -179,7 +173,7 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); @@ -190,21 +184,18 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr+=src_pixels_per_line; + src_ptr += src_pixels_per_line; // save only 8 bytes - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); - output_ptr+=output_pitch; + output_ptr += output_pitch; } } -void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { +void vpx_filter_block1d8_v8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i addFilterReg64, filtersReg, minReg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; @@ -217,7 +208,7 @@ void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. 
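The vertical kernels, like the one whose setup appears above, apply the same eight taps down a column; each iteration the row registers are shifted down by one (the `srcReg1 = srcReg2; ... srcReg7 = srcReg8;` sequence further below), so only the newest row has to be reloaded. A plain-C reference for the computation itself, written by the editor as a sketch and assuming `src` already points three rows above the output position, which is what the `src - src_stride * 3` argument in the FUN_CONV_1D lines below arranges:

/* --- editor's sketch, not part of the patch --- */
#include <stdint.h>
#include <stddef.h>

static void vert8_ref(const uint8_t *src, ptrdiff_t src_pitch, uint8_t *dst,
                      ptrdiff_t dst_pitch, int width, int height,
                      const int16_t *taps) {  /* 8 taps in Q7 */
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int32_t sum = 64;                       /* rounding */
      for (int k = 0; k < 8; ++k)
        sum += taps[k] * src[(y + k) * src_pitch + x];
      sum >>= 7;
      dst[y * dst_pitch + x] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
    }
  }
}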
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits in the filter firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); @@ -269,7 +260,7 @@ void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr+=src_pitch; + src_ptr += src_pitch; // shift down a row srcReg1 = srcReg2; @@ -281,9 +272,9 @@ void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, srcReg7 = srcReg8; // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); - output_ptr+=out_pitch; + output_ptr += out_pitch; } } @@ -339,32 +330,33 @@ FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, ssse3); -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ -} +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ + const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ + const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ + const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ + \ + const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ + const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ + const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ + const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ + \ + out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ + out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ + out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ + out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ + out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ + out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ + out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ + out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ + } static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, uint8_t *dst, const int16_t *x_filter) { @@ -420,7 +412,7 @@ static void filter_horiz_w8_ssse3(const uint8_t 
*src_x, ptrdiff_t src_pitch, // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)dst, temp); + _mm_storel_epi64((__m128i *)dst, temp); } static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, @@ -436,23 +428,22 @@ static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, - A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i*)dst, A); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H); + TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); + + _mm_storel_epi64((__m128i *)dst, A); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); } static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; @@ -523,7 +514,7 @@ static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); // 02 03 12 13 22 23 32 33 @@ -565,16 +556,16 @@ static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, C = _mm_srli_si128(A, 8); D = _mm_srli_si128(A, 12); - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); + *(int *)(dst) = _mm_cvtsi128_si32(A); + *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); + *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); + *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); } static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; @@ -648,8 +639,8 @@ static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int 
y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int y; int y_q4 = y0_q4; @@ -705,13 +696,13 @@ static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)dst, temp); + _mm_storel_epi64((__m128i *)dst, temp); } static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int y; int y_q4 = y0_q4; @@ -794,15 +785,15 @@ static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, // result temp_hi = _mm_packus_epi16(temp_lo, temp_hi); src_ptr += 16; - // save 16 bytes convolve result - _mm_store_si128((__m128i*)&dst[i], temp_hi); + // save 16 bytes convolve result + _mm_store_si128((__m128i *)&dst[i], temp_hi); } } static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int y; int y_q4 = y0_q4; @@ -822,11 +813,9 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, - int w, int h) { + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. 
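The note that closes the hunk above summarizes the structure: a horizontal pass fills a fixed-stride temp buffer (with extra rows so the vertical taps have context), and a vertical pass then writes the final pixels; the calls further below pick the 4-, 8- or 16-wide variant of each pass based on w. A compact editor's sketch of that flow, using hypothetical pass callbacks and an assumed temp stride rather than the MAX_SB_SIZE sizing the real code uses:

/* --- editor's sketch, not part of the patch --- */
#include <stdint.h>

enum { TMP_STRIDE = 128, TAPS = 8 };   /* assumed sizes, for illustration */

typedef void (*pass_fn)(const uint8_t *src, int src_stride, uint8_t *dst,
                        int dst_stride, int w, int h);

/* Two-pass 2D filtering: horizontal into temp (with TAPS - 1 extra rows
 * so the vertical taps have context), then vertical into dst. Both pass
 * callbacks are assumed to take a pointer to the topmost row they read. */
static void convolve2d_sketch(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
                              pass_fn horiz, pass_fn vert) {
  static uint8_t temp[TMP_STRIDE * (TMP_STRIDE + TAPS)];  /* w,h <= TMP_STRIDE */
  const int top = TAPS / 2 - 1;
  horiz(src - top * src_stride, src_stride, temp, TMP_STRIDE, w, h + TAPS - 1);
  vert(temp, TMP_STRIDE, dst, dst_stride, w, h);
}

The `temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1)` offsets in the real calls serve the same purpose as the `top` adjustment here: lining the vertical pass up with the first output row.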
@@ -851,38 +840,26 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, if (w >= 8) { scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, - temp, - MAX_SB_SIZE, - x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height); } else { scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, - temp, - MAX_SB_SIZE, - x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height); } if (w >= 16) { scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, - dst, - dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); } else if (w == 8) { scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, - dst, - dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); } else { scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, - dst, - dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); } } @@ -896,10 +873,9 @@ static int get_filter_offset(const int16_t *f, const InterpKernel *base) { return (int)((const InterpKernel *)(intptr_t)f - base); } -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); @@ -907,9 +883,8 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); - scaledconvolve2d(src, src_stride, dst, dst_stride, - filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); + scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, filters_y, y0_q4, y_step_q4, w, h); } // void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, @@ -923,4 +898,4 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, // const int16_t *filter_y, int y_step_q4, // int w, int h); FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_ , ssse3); +FUN_CONV_2D(avg_, ssse3); diff --git a/vpxdec.c b/vpxdec.c index 67b3b51c7e1bdf1e00c9c3457b5be607870cb5fb..cd6d7ed8a5724a962259e823909afe431d53c403 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -47,117 +47,124 @@ struct VpxDecInputContext { struct WebmInputContext *webm_ctx; }; -static const arg_def_t looparg = ARG_DEF( - NULL, "loops", 1, "Number of times to decode the file"); -static const arg_def_t codecarg = ARG_DEF( - NULL, "codec", 1, "Codec to use"); -static const arg_def_t use_yv12 = ARG_DEF( - NULL, "yv12", 0, "Output raw YV12 frames"); -static const arg_def_t use_i420 = ARG_DEF( - NULL, "i420", 0, "Output raw I420 frames"); -static const arg_def_t flipuvarg = ARG_DEF( - NULL, "flipuv", 0, "Flip the chroma planes in the output"); -static const arg_def_t rawvideo = ARG_DEF( - NULL, "rawvideo", 0, "Output raw YUV frames"); -static const arg_def_t 
noblitarg = ARG_DEF( - NULL, "noblit", 0, "Don't process the decoded frames"); -static const arg_def_t progressarg = ARG_DEF( - NULL, "progress", 0, "Show progress after each frame decodes"); -static const arg_def_t limitarg = ARG_DEF( - NULL, "limit", 1, "Stop decoding after n frames"); -static const arg_def_t skiparg = ARG_DEF( - NULL, "skip", 1, "Skip the first n input frames"); -static const arg_def_t postprocarg = ARG_DEF( - NULL, "postproc", 0, "Postprocess decoded frames"); -static const arg_def_t summaryarg = ARG_DEF( - NULL, "summary", 0, "Show timing summary"); -static const arg_def_t outputfile = ARG_DEF( - "o", "output", 1, "Output file name pattern (see below)"); -static const arg_def_t threadsarg = ARG_DEF( - "t", "threads", 1, "Max threads to use"); -static const arg_def_t frameparallelarg = ARG_DEF( - NULL, "frame-parallel", 0, "Frame parallel decode"); -static const arg_def_t verbosearg = ARG_DEF( - "v", "verbose", 0, "Show version string"); -static const arg_def_t error_concealment = ARG_DEF( - NULL, "error-concealment", 0, "Enable decoder error-concealment"); -static const arg_def_t scalearg = ARG_DEF( - "S", "scale", 0, "Scale output frames uniformly"); -static const arg_def_t continuearg = ARG_DEF( - "k", "keep-going", 0, "(debug) Continue decoding after error"); -static const arg_def_t fb_arg = ARG_DEF( - NULL, "frame-buffers", 1, "Number of frame buffers to use"); -static const arg_def_t md5arg = ARG_DEF( - NULL, "md5", 0, "Compute the MD5 sum of the decoded frame"); +static const arg_def_t looparg = + ARG_DEF(NULL, "loops", 1, "Number of times to decode the file"); +static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); +static const arg_def_t use_yv12 = + ARG_DEF(NULL, "yv12", 0, "Output raw YV12 frames"); +static const arg_def_t use_i420 = + ARG_DEF(NULL, "i420", 0, "Output raw I420 frames"); +static const arg_def_t flipuvarg = + ARG_DEF(NULL, "flipuv", 0, "Flip the chroma planes in the output"); +static const arg_def_t rawvideo = + ARG_DEF(NULL, "rawvideo", 0, "Output raw YUV frames"); +static const arg_def_t noblitarg = + ARG_DEF(NULL, "noblit", 0, "Don't process the decoded frames"); +static const arg_def_t progressarg = + ARG_DEF(NULL, "progress", 0, "Show progress after each frame decodes"); +static const arg_def_t limitarg = + ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames"); +static const arg_def_t skiparg = + ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"); +static const arg_def_t postprocarg = + ARG_DEF(NULL, "postproc", 0, "Postprocess decoded frames"); +static const arg_def_t summaryarg = + ARG_DEF(NULL, "summary", 0, "Show timing summary"); +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output file name pattern (see below)"); +static const arg_def_t threadsarg = + ARG_DEF("t", "threads", 1, "Max threads to use"); +static const arg_def_t frameparallelarg = + ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode"); +static const arg_def_t verbosearg = + ARG_DEF("v", "verbose", 0, "Show version string"); +static const arg_def_t error_concealment = + ARG_DEF(NULL, "error-concealment", 0, "Enable decoder error-concealment"); +static const arg_def_t scalearg = + ARG_DEF("S", "scale", 0, "Scale output frames uniformly"); +static const arg_def_t continuearg = + ARG_DEF("k", "keep-going", 0, "(debug) Continue decoding after error"); +static const arg_def_t fb_arg = + ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use"); +static const arg_def_t md5arg = + ARG_DEF(NULL, "md5", 0, 
"Compute the MD5 sum of the decoded frame"); #if CONFIG_VP9_HIGHBITDEPTH -static const arg_def_t outbitdeptharg = ARG_DEF( - NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames"); +static const arg_def_t outbitdeptharg = + ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames"); #endif #if CONFIG_EXT_TILE -static const arg_def_t tiler = ARG_DEF( - NULL, "tile-row", 1, "Row index of tile to decode " - "(-1 for all rows)"); -static const arg_def_t tilec = ARG_DEF( - NULL, "tile-column", 1, "Column index of tile to decode " - "(-1 for all columns)"); +static const arg_def_t tiler = ARG_DEF(NULL, "tile-row", 1, + "Row index of tile to decode " + "(-1 for all rows)"); +static const arg_def_t tilec = ARG_DEF(NULL, "tile-column", 1, + "Column index of tile to decode " + "(-1 for all columns)"); #endif // CONFIG_EXT_TILE -static const arg_def_t *all_args[] = { - &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg, - &progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile, - &threadsarg, &frameparallelarg, &verbosearg, &scalearg, &fb_arg, - &md5arg, &error_concealment, &continuearg, +static const arg_def_t *all_args[] = { &codecarg, + &use_yv12, + &use_i420, + &flipuvarg, + &rawvideo, + &noblitarg, + &progressarg, + &limitarg, + &skiparg, + &postprocarg, + &summaryarg, + &outputfile, + &threadsarg, + &frameparallelarg, + &verbosearg, + &scalearg, + &fb_arg, + &md5arg, + &error_concealment, + &continuearg, #if CONFIG_VP9_HIGHBITDEPTH - &outbitdeptharg, + &outbitdeptharg, #endif #if CONFIG_EXT_TILE - &tiler, &tilec, + &tiler, + &tilec, #endif // CONFIG_EXT_TILE - NULL -}; + NULL }; #if CONFIG_LIBYUV static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst, - FilterModeEnum mode) { + FilterModeEnum mode) { #if CONFIG_VP9_HIGHBITDEPTH if (src->fmt == VPX_IMG_FMT_I42016) { assert(dst->fmt == VPX_IMG_FMT_I42016); - return I420Scale_16((uint16_t*)src->planes[VPX_PLANE_Y], - src->stride[VPX_PLANE_Y]/2, - (uint16_t*)src->planes[VPX_PLANE_U], - src->stride[VPX_PLANE_U]/2, - (uint16_t*)src->planes[VPX_PLANE_V], - src->stride[VPX_PLANE_V]/2, - src->d_w, src->d_h, - (uint16_t*)dst->planes[VPX_PLANE_Y], - dst->stride[VPX_PLANE_Y]/2, - (uint16_t*)dst->planes[VPX_PLANE_U], - dst->stride[VPX_PLANE_U]/2, - (uint16_t*)dst->planes[VPX_PLANE_V], - dst->stride[VPX_PLANE_V]/2, - dst->d_w, dst->d_h, - mode); + return I420Scale_16( + (uint16_t *)src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y] / 2, + (uint16_t *)src->planes[VPX_PLANE_U], src->stride[VPX_PLANE_U] / 2, + (uint16_t *)src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V] / 2, + src->d_w, src->d_h, (uint16_t *)dst->planes[VPX_PLANE_Y], + dst->stride[VPX_PLANE_Y] / 2, (uint16_t *)dst->planes[VPX_PLANE_U], + dst->stride[VPX_PLANE_U] / 2, (uint16_t *)dst->planes[VPX_PLANE_V], + dst->stride[VPX_PLANE_V] / 2, dst->d_w, dst->d_h, mode); } #endif assert(src->fmt == VPX_IMG_FMT_I420); assert(dst->fmt == VPX_IMG_FMT_I420); return I420Scale(src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y], src->planes[VPX_PLANE_U], src->stride[VPX_PLANE_U], - src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V], - src->d_w, src->d_h, - dst->planes[VPX_PLANE_Y], dst->stride[VPX_PLANE_Y], + src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V], src->d_w, + src->d_h, dst->planes[VPX_PLANE_Y], dst->stride[VPX_PLANE_Y], dst->planes[VPX_PLANE_U], dst->stride[VPX_PLANE_U], - dst->planes[VPX_PLANE_V], dst->stride[VPX_PLANE_V], - dst->d_w, dst->d_h, - mode); + dst->planes[VPX_PLANE_V], dst->stride[VPX_PLANE_V], dst->d_w, + 
dst->d_h, mode); } #endif void usage_exit(void) { int i; - fprintf(stderr, "Usage: %s <options> filename\n\n" - "Options:\n", exec_name); + fprintf(stderr, + "Usage: %s <options> filename\n\n" + "Options:\n", + exec_name); arg_show_usage(stderr, all_args); fprintf(stderr, "\nOutput File Patterns:\n\n" @@ -172,27 +179,25 @@ void usage_exit(void) { "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)" "\n\n Pattern arguments are only supported in conjunction " "with the --yv12 and\n --i420 options. If the -o option is " - "not specified, the output will be\n directed to stdout.\n" - ); + "not specified, the output will be\n directed to stdout.\n"); fprintf(stderr, "\nIncluded decoders:\n\n"); for (i = 0; i < get_vpx_decoder_count(); ++i) { const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - fprintf(stderr, " %-6s - %s\n", - decoder->name, vpx_codec_iface_name(decoder->codec_interface())); + fprintf(stderr, " %-6s - %s\n", decoder->name, + vpx_codec_iface_name(decoder->codec_interface())); } exit(EXIT_FAILURE); } -static int raw_read_frame(FILE *infile, uint8_t **buffer, - size_t *bytes_read, size_t *buffer_size) { +static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size) { char raw_hdr[RAW_FRAME_HDR_SZ]; size_t frame_size = 0; if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) { - if (!feof(infile)) - warn("Failed to read RAW frame size\n"); + if (!feof(infile)) warn("Failed to read RAW frame size\n"); } else { const size_t kCorruptFrameThreshold = 256 * 1024 * 1024; const size_t kFrameTooSmallThreshold = 256 * 1024; @@ -239,13 +244,12 @@ static int read_frame(struct VpxDecInputContext *input, uint8_t **buf, return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer); #endif case FILE_TYPE_RAW: - return raw_read_frame(input->vpx_input_ctx->file, - buf, bytes_in_buffer, buffer_size); + return raw_read_frame(input->vpx_input_ctx->file, buf, bytes_in_buffer, + buffer_size); case FILE_TYPE_IVF: - return ivf_read_frame(input->vpx_input_ctx->file, - buf, bytes_in_buffer, buffer_size); - default: - return 1; + return ivf_read_frame(input->vpx_input_ctx->file, buf, bytes_in_buffer, + buffer_size); + default: return 1; } } @@ -258,7 +262,7 @@ static void update_image_md5(const vpx_image_t *img, const int planes[3], const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1); const int h = vpx_img_plane_height(img, plane); for (y = 0; y < h; ++y) { @@ -304,8 +308,8 @@ static int file_is_raw(struct VpxInputContext *input) { if (mem_get_le32(buf) < 256 * 1024 * 1024) { for (i = 0; i < get_vpx_decoder_count(); ++i) { const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - if (!vpx_codec_peek_stream_info(decoder->codec_interface(), - buf + 4, 32 - 4, &si)) { + if (!vpx_codec_peek_stream_info(decoder->codec_interface(), buf + 4, + 32 - 4, &si)) { is_raw = 1; input->fourcc = decoder->fourcc; input->width = si.w; @@ -324,13 +328,13 @@ static int file_is_raw(struct VpxInputContext *input) { static void show_progress(int frame_in, int frame_out, uint64_t dx_time) { fprintf(stderr, - "%d decoded frames/%d showed frames in %"PRId64" us (%.2f fps)\r", + "%d decoded frames/%d showed frames in %" PRId64 " us (%.2f fps)\r", frame_in, frame_out, dx_time, (double)frame_out * 1000000.0 / (double)dx_time); } struct ExternalFrameBuffer { - uint8_t* data; + uint8_t *data; size_t size; int in_use; }; @@ -349,23 +353,19 @@ static int get_vp9_frame_buffer(void *cb_priv, size_t min_size, int i; struct ExternalFrameBufferList *const ext_fb_list = (struct ExternalFrameBufferList *)cb_priv; - if (ext_fb_list == NULL) - return -1; + if (ext_fb_list == NULL) return -1; // Find a free frame buffer. for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) { - if (!ext_fb_list->ext_fb[i].in_use) - break; + if (!ext_fb_list->ext_fb[i].in_use) break; } - if (i == ext_fb_list->num_external_frame_buffers) - return -1; + if (i == ext_fb_list->num_external_frame_buffers) return -1; if (ext_fb_list->ext_fb[i].size < min_size) { free(ext_fb_list->ext_fb[i].data); ext_fb_list->ext_fb[i].data = (uint8_t *)calloc(min_size, sizeof(uint8_t)); - if (!ext_fb_list->ext_fb[i].data) - return -1; + if (!ext_fb_list->ext_fb[i].data) return -1; ext_fb_list->ext_fb[i].size = min_size; } @@ -406,47 +406,22 @@ static void generate_filename(const char *pattern, char *out, size_t q_len, /* parse the pattern */ q[q_len - 1] = '\0'; switch (p[1]) { - case 'w': - snprintf(q, q_len - 1, "%d", d_w); - break; - case 'h': - snprintf(q, q_len - 1, "%d", d_h); - break; - case '1': - snprintf(q, q_len - 1, "%d", frame_in); - break; - case '2': - snprintf(q, q_len - 1, "%02d", frame_in); - break; - case '3': - snprintf(q, q_len - 1, "%03d", frame_in); - break; - case '4': - snprintf(q, q_len - 1, "%04d", frame_in); - break; - case '5': - snprintf(q, q_len - 1, "%05d", frame_in); - break; - case '6': - snprintf(q, q_len - 1, "%06d", frame_in); - break; - case '7': - snprintf(q, q_len - 1, "%07d", frame_in); - break; - case '8': - snprintf(q, q_len - 1, "%08d", frame_in); - break; - case '9': - snprintf(q, q_len - 1, "%09d", frame_in); - break; - default: - die("Unrecognized pattern %%%c\n", p[1]); - break; + case 'w': snprintf(q, q_len - 1, "%d", d_w); break; + case 'h': snprintf(q, q_len - 1, "%d", d_h); break; + case '1': snprintf(q, q_len - 1, "%d", frame_in); break; + case '2': snprintf(q, q_len - 1, "%02d", frame_in); break; + case '3': snprintf(q, q_len - 1, "%03d", frame_in); break; + case '4': snprintf(q, q_len - 1, "%04d", frame_in); break; + case '5': snprintf(q, q_len - 1, "%05d", frame_in); break; + case '6': snprintf(q, q_len - 1, "%06d", frame_in); break; + case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; + case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; + case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; + default: die("Unrecognized pattern %%%c\n", p[1]); break; 
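For readers who have not seen the pattern syntax this switch implements: `%w` and `%h` expand to the frame dimensions, and `%1` through `%9` expand to the frame number zero-padded to that many digits, using exactly the snprintf formats listed above. A throwaway illustration with made-up values (editor's sketch, not code from the tree):

/* --- editor's sketch, not part of the patch --- */
#include <stdio.h>

int main(void) {
  char q[16];
  snprintf(q, sizeof(q), "%03d", 7);    /* what "%3" produces for frame 7   */
  printf("frame-%s.yuv\n", q);          /* -> frame-007.yuv                 */
  snprintf(q, sizeof(q), "%d", 640);    /* what "%w" produces for width 640 */
  printf("%sx480.yuv\n", q);            /* -> 640x480.yuv                   */
  return 0;
}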
} pat_len = strlen(q); - if (pat_len >= q_len - 1) - die("Output filename too long.\n"); + if (pat_len >= q_len - 1) die("Output filename too long.\n"); q += pat_len; p += 2; q_len -= pat_len; @@ -459,8 +434,7 @@ static void generate_filename(const char *pattern, char *out, size_t q_len, else copy_len = next_pat - p; - if (copy_len >= q_len - 1) - die("Output filename too long.\n"); + if (copy_len >= q_len - 1) die("Output filename too long.\n"); memcpy(q, p, copy_len); q[copy_len] = '\0'; @@ -478,8 +452,7 @@ static int is_single_file(const char *outfile_pattern) { p = strchr(p, '%'); if (p && p[1] >= '1' && p[1] <= '9') return 0; // pattern contains sequence number, so it's not unique - if (p) - p++; + if (p) p++; } while (p); return 1; @@ -488,8 +461,7 @@ static int is_single_file(const char *outfile_pattern) { static void print_md5(unsigned char digest[16], const char *filename) { int i; - for (i = 0; i < 16; ++i) - printf("%02x", digest[i]); + for (i = 0; i < 16; ++i) printf("%02x", digest[i]); printf(" %s\n", filename); } @@ -499,8 +471,7 @@ static FILE *open_outfile(const char *name) { return stdout; } else { FILE *file = fopen(name, "wb"); - if (!file) - fatal("Failed to open output file '%s'", name); + if (!file) fatal("Failed to open output file '%s'", name); return file; } } @@ -509,62 +480,61 @@ static FILE *open_outfile(const char *name) { static int img_shifted_realloc_required(const vpx_image_t *img, const vpx_image_t *shifted, vpx_img_fmt_t required_fmt) { - return img->d_w != shifted->d_w || - img->d_h != shifted->d_h || + return img->d_w != shifted->d_w || img->d_h != shifted->d_h || required_fmt != shifted->fmt; } #endif static int main_loop(int argc, const char **argv_) { - vpx_codec_ctx_t decoder; - char *fn = NULL; - int i; - uint8_t *buf = NULL; - size_t bytes_in_buffer = 0, buffer_size = 0; - FILE *infile; - int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; - int do_md5 = 0, progress = 0, frame_parallel = 0; - int stop_after = 0, postproc = 0, summary = 0, quiet = 1; - int arg_skip = 0; - int ec_enabled = 0; - int keep_going = 0; + vpx_codec_ctx_t decoder; + char *fn = NULL; + int i; + uint8_t *buf = NULL; + size_t bytes_in_buffer = 0, buffer_size = 0; + FILE *infile; + int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; + int do_md5 = 0, progress = 0, frame_parallel = 0; + int stop_after = 0, postproc = 0, summary = 0, quiet = 1; + int arg_skip = 0; + int ec_enabled = 0; + int keep_going = 0; const VpxInterface *interface = NULL; const VpxInterface *fourcc_interface = NULL; uint64_t dx_time = 0; - struct arg arg; - char **argv, **argi, **argj; - - int single_file; - int use_y4m = 1; - int opt_yv12 = 0; - int opt_i420 = 0; - vpx_codec_dec_cfg_t cfg = {0, 0, 0}; + struct arg arg; + char **argv, **argi, **argj; + + int single_file; + int use_y4m = 1; + int opt_yv12 = 0; + int opt_i420 = 0; + vpx_codec_dec_cfg_t cfg = { 0, 0, 0 }; #if CONFIG_VP9_HIGHBITDEPTH - unsigned int output_bit_depth = 0; + unsigned int output_bit_depth = 0; #endif #if CONFIG_EXT_TILE - int tile_row = -1; - int tile_col = -1; + int tile_row = -1; + int tile_col = -1; #endif // CONFIG_EXT_TILE - int frames_corrupted = 0; - int dec_flags = 0; - int do_scale = 0; - vpx_image_t *scaled_img = NULL; + int frames_corrupted = 0; + int dec_flags = 0; + int do_scale = 0; + vpx_image_t *scaled_img = NULL; #if CONFIG_VP9_HIGHBITDEPTH - vpx_image_t *img_shifted = NULL; + vpx_image_t *img_shifted = NULL; #endif - int frame_avail, got_data, flush_decoder = 0; - int num_external_frame_buffers = 0; - 
struct ExternalFrameBufferList ext_fb_list = {0, NULL}; + int frame_avail, got_data, flush_decoder = 0; + int num_external_frame_buffers = 0; + struct ExternalFrameBufferList ext_fb_list = { 0, NULL }; const char *outfile_pattern = NULL; - char outfile_name[PATH_MAX] = {0}; + char outfile_name[PATH_MAX] = { 0 }; FILE *outfile = NULL; MD5Context md5_ctx; unsigned char md5_digest[16]; - struct VpxDecInputContext input = {NULL, NULL}; + struct VpxDecInputContext input = { NULL, NULL }; struct VpxInputContext vpx_input_ctx; #if CONFIG_WEBM_IO struct WebmInputContext webm_ctx; @@ -702,7 +672,8 @@ static int main_loop(int argc, const char **argv_) { if (use_y4m && !noblit) { if (!single_file) { - fprintf(stderr, "YUV4MPEG2 not supported with output patterns," + fprintf(stderr, + "YUV4MPEG2 not supported with output patterns," " try --i420 or --yv12 or --rawvideo.\n"); return EXIT_FAILURE; } @@ -710,7 +681,8 @@ static int main_loop(int argc, const char **argv_) { #if CONFIG_WEBM_IO if (vpx_input_ctx.file_type == FILE_TYPE_WEBM) { if (webm_guess_framerate(input.webm_ctx, input.vpx_input_ctx)) { - fprintf(stderr, "Failed to guess framerate -- error parsing " + fprintf(stderr, + "Failed to guess framerate -- error parsing " "webm file?\n"); return EXIT_FAILURE; } @@ -724,21 +696,19 @@ static int main_loop(int argc, const char **argv_) { else interface = fourcc_interface; - if (!interface) - interface = get_vpx_decoder_by_index(0); + if (!interface) interface = get_vpx_decoder_by_index(0); dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) | (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0) | (frame_parallel ? VPX_CODEC_USE_FRAME_THREADING : 0); - if (vpx_codec_dec_init(&decoder, interface->codec_interface(), - &cfg, dec_flags)) { + if (vpx_codec_dec_init(&decoder, interface->codec_interface(), &cfg, + dec_flags)) { fprintf(stderr, "Failed to initialize decoder: %s\n", vpx_codec_error(&decoder)); return EXIT_FAILURE; } - if (!quiet) - fprintf(stderr, "%s\n", decoder.name); + if (!quiet) fprintf(stderr, "%s\n", decoder.name); #if CONFIG_VP10_DECODER && CONFIG_EXT_TILE if (strncmp(decoder.name, "WebM Project VP10", 17) == 0) { @@ -756,11 +726,9 @@ static int main_loop(int argc, const char **argv_) { } #endif - if (arg_skip) - fprintf(stderr, "Skipping first %d frames.\n", arg_skip); + if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); while (arg_skip) { - if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) - break; + if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; arg_skip--; } @@ -768,9 +736,9 @@ static int main_loop(int argc, const char **argv_) { ext_fb_list.num_external_frame_buffers = num_external_frame_buffers; ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc( num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb)); - if (vpx_codec_set_frame_buffer_functions( - &decoder, get_vp9_frame_buffer, release_vp9_frame_buffer, - &ext_fb_list)) { + if (vpx_codec_set_frame_buffer_functions(&decoder, get_vp9_frame_buffer, + release_vp9_frame_buffer, + &ext_fb_list)) { fprintf(stderr, "Failed to configure external frame buffers: %s\n", vpx_codec_error(&decoder)); return EXIT_FAILURE; @@ -782,10 +750,10 @@ static int main_loop(int argc, const char **argv_) { /* Decode file */ while (frame_avail || got_data) { - vpx_codec_iter_t iter = NULL; - vpx_image_t *img; + vpx_codec_iter_t iter = NULL; + vpx_image_t *img; struct vpx_usec_timer timer; - int corrupted = 0; + int corrupted = 0; frame_avail = 0; if (!stop_after || frame_in < stop_after) { @@ 
-795,16 +763,14 @@ static int main_loop(int argc, const char **argv_) { vpx_usec_timer_start(&timer); - if (vpx_codec_decode(&decoder, buf, (unsigned int)bytes_in_buffer, - NULL, 0)) { + if (vpx_codec_decode(&decoder, buf, (unsigned int)bytes_in_buffer, NULL, + 0)) { const char *detail = vpx_codec_error_detail(&decoder); - warn("Failed to decode frame %d: %s", - frame_in, vpx_codec_error(&decoder)); + warn("Failed to decode frame %d: %s", frame_in, + vpx_codec_error(&decoder)); - if (detail) - warn("Additional information: %s", detail); - if (!keep_going) - goto fail; + if (detail) warn("Additional information: %s", detail); + if (!keep_going) goto fail; } vpx_usec_timer_mark(&timer); @@ -837,17 +803,15 @@ static int main_loop(int argc, const char **argv_) { if (!frame_parallel && vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) { warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder)); - if (!keep_going) - goto fail; + if (!keep_going) goto fail; } frames_corrupted += corrupted; - if (progress) - show_progress(frame_in, frame_out, dx_time); + if (progress) show_progress(frame_in, frame_out, dx_time); if (!noblit && img) { - const int PLANES_YUV[] = {VPX_PLANE_Y, VPX_PLANE_U, VPX_PLANE_V}; - const int PLANES_YVU[] = {VPX_PLANE_Y, VPX_PLANE_V, VPX_PLANE_U}; + const int PLANES_YUV[] = { VPX_PLANE_Y, VPX_PLANE_U, VPX_PLANE_V }; + const int PLANES_YVU[] = { VPX_PLANE_Y, VPX_PLANE_V, VPX_PLANE_U }; const int *planes = flipuv ? PLANES_YVU : PLANES_YUV; if (do_scale) { @@ -871,8 +835,8 @@ static int main_loop(int argc, const char **argv_) { render_height = render_size[1]; } } - scaled_img = vpx_img_alloc(NULL, img->fmt, render_width, - render_height, 16); + scaled_img = + vpx_img_alloc(NULL, img->fmt, render_width, render_height, 16); scaled_img->bit_depth = img->bit_depth; } @@ -881,7 +845,8 @@ static int main_loop(int argc, const char **argv_) { libyuv_scale(img, scaled_img, kFilterBox); img = scaled_img; #else - fprintf(stderr, "Failed to scale output frame: %s.\n" + fprintf(stderr, + "Failed to scale output frame: %s.\n" "Scaling is disabled in this configuration. " "To enable scaling, configure with --enable-libyuv\n", vpx_codec_error(&decoder)); @@ -896,22 +861,22 @@ static int main_loop(int argc, const char **argv_) { } // Shift up or down if necessary if (output_bit_depth != 0 && output_bit_depth != img->bit_depth) { - const vpx_img_fmt_t shifted_fmt = output_bit_depth == 8 ? - img->fmt ^ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) : - img->fmt | VPX_IMG_FMT_HIGHBITDEPTH; + const vpx_img_fmt_t shifted_fmt = + output_bit_depth == 8 + ? 
img->fmt ^ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) + : img->fmt | VPX_IMG_FMT_HIGHBITDEPTH; if (img_shifted && img_shifted_realloc_required(img, img_shifted, shifted_fmt)) { vpx_img_free(img_shifted); img_shifted = NULL; } if (!img_shifted) { - img_shifted = vpx_img_alloc(NULL, shifted_fmt, - img->d_w, img->d_h, 16); + img_shifted = + vpx_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16); img_shifted->bit_depth = output_bit_depth; } if (output_bit_depth > img->bit_depth) { - vpx_img_upshift(img_shifted, img, - output_bit_depth - img->bit_depth); + vpx_img_upshift(img_shifted, img, output_bit_depth - img->bit_depth); } else { vpx_img_downshift(img_shifted, img, img->bit_depth - output_bit_depth); @@ -927,7 +892,7 @@ static int main_loop(int argc, const char **argv_) { if (single_file) { if (use_y4m) { - char buf[Y4M_BUFFER_SIZE] = {0}; + char buf[Y4M_BUFFER_SIZE] = { 0 }; size_t len = 0; if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) { fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n"); @@ -935,11 +900,9 @@ static int main_loop(int argc, const char **argv_) { } if (frame_out == 1) { // Y4M file header - len = y4m_write_file_header(buf, sizeof(buf), - vpx_input_ctx.width, - vpx_input_ctx.height, - &vpx_input_ctx.framerate, - img->fmt, img->bit_depth); + len = y4m_write_file_header( + buf, sizeof(buf), vpx_input_ctx.width, vpx_input_ctx.height, + &vpx_input_ctx.framerate, img->fmt, img->bit_depth); if (do_md5) { MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); } else { @@ -967,7 +930,8 @@ static int main_loop(int argc, const char **argv_) { } if (opt_yv12) { if ((img->fmt != VPX_IMG_FMT_I420 && - img->fmt != VPX_IMG_FMT_YV12) || img->bit_depth != 8) { + img->fmt != VPX_IMG_FMT_YV12) || + img->bit_depth != 8) { fprintf(stderr, "Cannot produce yv12 output for bit-stream.\n"); goto fail; } @@ -981,8 +945,8 @@ static int main_loop(int argc, const char **argv_) { write_image_file(img, planes, outfile); } } else { - generate_filename(outfile_pattern, outfile_name, PATH_MAX, - img->d_w, img->d_h, frame_in); + generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, + img->d_h, frame_in); if (do_md5) { MD5Init(&md5_ctx); update_image_md5(img, planes, &md5_ctx); @@ -1027,8 +991,7 @@ fail: webm_free(input.webm_ctx); #endif - if (input.vpx_input_ctx->file_type != FILE_TYPE_WEBM) - free(buf); + if (input.vpx_input_ctx->file_type != FILE_TYPE_WEBM) free(buf); if (scaled_img) vpx_img_free(scaled_img); #if CONFIG_VP9_HIGHBITDEPTH @@ -1063,7 +1026,6 @@ int main(int argc, const char **argv_) { } } free(argv); - for (i = 0; !error && i < loops; i++) - error = main_loop(argc, argv_); + for (i = 0; !error && i < loops; i++) error = main_loop(argc, argv_); return error; } diff --git a/vpxenc.c b/vpxenc.c index 30d9696e13628c0d1edcd2cdc88793d3daf54946..1bc060b4f6c57f37e38aba02c80783e016c0c731 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -51,8 +51,7 @@ #include "./y4minput.h" /* Swallow warnings about unused results of fread/fwrite */ -static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, - FILE *stream) { +static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { return fread(ptr, size, nmemb, stream); } #define fread wrap_fread @@ -63,7 +62,6 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, } #define fwrite wrap_fwrite - static const char *exec_name; static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal, @@ -74,11 +72,9 @@ static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal, 
vfprintf(stderr, s, ap); fprintf(stderr, ": %s\n", vpx_codec_error(ctx)); - if (detail) - fprintf(stderr, " %s\n", detail); + if (detail) fprintf(stderr, " %s\n", detail); - if (fatal) - exit(EXIT_FAILURE); + if (fatal) exit(EXIT_FAILURE); } } @@ -105,8 +101,7 @@ static int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) { int shortread = 0; if (input_ctx->file_type == FILE_TYPE_Y4M) { - if (y4m_input_fetch_frame(y4m, f, img) < 1) - return 0; + if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; } else { shortread = read_yuv_frame(input_ctx, img); } @@ -128,252 +123,262 @@ static int fourcc_is_ivf(const char detect[4]) { return 0; } -static const arg_def_t debugmode = ARG_DEF( - "D", "debug", 0, "Debug mode (makes output deterministic)"); -static const arg_def_t outputfile = ARG_DEF( - "o", "output", 1, "Output filename"); -static const arg_def_t use_yv12 = ARG_DEF( - NULL, "yv12", 0, "Input file is YV12 "); -static const arg_def_t use_i420 = ARG_DEF( - NULL, "i420", 0, "Input file is I420 (default)"); -static const arg_def_t use_i422 = ARG_DEF( - NULL, "i422", 0, "Input file is I422"); -static const arg_def_t use_i444 = ARG_DEF( - NULL, "i444", 0, "Input file is I444"); -static const arg_def_t use_i440 = ARG_DEF( - NULL, "i440", 0, "Input file is I440"); -static const arg_def_t codecarg = ARG_DEF( - NULL, "codec", 1, "Codec to use"); -static const arg_def_t passes = ARG_DEF( - "p", "passes", 1, "Number of passes (1/2)"); -static const arg_def_t pass_arg = ARG_DEF( - NULL, "pass", 1, "Pass to execute (1/2)"); -static const arg_def_t fpf_name = ARG_DEF( - NULL, "fpf", 1, "First pass statistics file name"); +static const arg_def_t debugmode = + ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"); +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output filename"); +static const arg_def_t use_yv12 = + ARG_DEF(NULL, "yv12", 0, "Input file is YV12 "); +static const arg_def_t use_i420 = + ARG_DEF(NULL, "i420", 0, "Input file is I420 (default)"); +static const arg_def_t use_i422 = + ARG_DEF(NULL, "i422", 0, "Input file is I422"); +static const arg_def_t use_i444 = + ARG_DEF(NULL, "i444", 0, "Input file is I444"); +static const arg_def_t use_i440 = + ARG_DEF(NULL, "i440", 0, "Input file is I440"); +static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); +static const arg_def_t passes = + ARG_DEF("p", "passes", 1, "Number of passes (1/2)"); +static const arg_def_t pass_arg = + ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); +static const arg_def_t fpf_name = + ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); #if CONFIG_FP_MB_STATS -static const arg_def_t fpmbf_name = ARG_DEF( - NULL, "fpmbf", 1, "First pass block statistics file name"); +static const arg_def_t fpmbf_name = + ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name"); #endif -static const arg_def_t limit = ARG_DEF( - NULL, "limit", 1, "Stop encoding after n input frames"); -static const arg_def_t skip = ARG_DEF( - NULL, "skip", 1, "Skip the first n input frames"); -static const arg_def_t deadline = ARG_DEF( - "d", "deadline", 1, "Deadline per frame (usec)"); -static const arg_def_t best_dl = ARG_DEF( - NULL, "best", 0, "Use Best Quality Deadline"); -static const arg_def_t good_dl = ARG_DEF( - NULL, "good", 0, "Use Good Quality Deadline"); -static const arg_def_t rt_dl = ARG_DEF( - NULL, "rt", 0, "Use Realtime Quality Deadline"); -static const arg_def_t quietarg = ARG_DEF( - "q", "quiet", 0, "Do not print encode progress"); -static const 
arg_def_t verbosearg = ARG_DEF( - "v", "verbose", 0, "Show encoder parameters"); -static const arg_def_t psnrarg = ARG_DEF( - NULL, "psnr", 0, "Show PSNR in status line"); +static const arg_def_t limit = + ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"); +static const arg_def_t skip = + ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"); +static const arg_def_t deadline = + ARG_DEF("d", "deadline", 1, "Deadline per frame (usec)"); +static const arg_def_t best_dl = + ARG_DEF(NULL, "best", 0, "Use Best Quality Deadline"); +static const arg_def_t good_dl = + ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline"); +static const arg_def_t rt_dl = + ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline"); +static const arg_def_t quietarg = + ARG_DEF("q", "quiet", 0, "Do not print encode progress"); +static const arg_def_t verbosearg = + ARG_DEF("v", "verbose", 0, "Show encoder parameters"); +static const arg_def_t psnrarg = + ARG_DEF(NULL, "psnr", 0, "Show PSNR in status line"); static const struct arg_enum_list test_decode_enum[] = { - {"off", TEST_DECODE_OFF}, - {"fatal", TEST_DECODE_FATAL}, - {"warn", TEST_DECODE_WARN}, - {NULL, 0} + { "off", TEST_DECODE_OFF }, + { "fatal", TEST_DECODE_FATAL }, + { "warn", TEST_DECODE_WARN }, + { NULL, 0 } }; static const arg_def_t recontest = ARG_DEF_ENUM( NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum); -static const arg_def_t framerate = ARG_DEF( - NULL, "fps", 1, "Stream frame rate (rate/scale)"); -static const arg_def_t use_webm = ARG_DEF( - NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)"); -static const arg_def_t use_ivf = ARG_DEF( - NULL, "ivf", 0, "Output IVF"); -static const arg_def_t out_part = ARG_DEF( - "P", "output-partitions", 0, - "Makes encoder output partitions. Requires IVF output!"); -static const arg_def_t q_hist_n = ARG_DEF( - NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)"); -static const arg_def_t rate_hist_n = ARG_DEF( - NULL, "rate-hist", 1, "Show rate histogram (n-buckets)"); -static const arg_def_t disable_warnings = ARG_DEF( - NULL, "disable-warnings", 0, - "Disable warnings about potentially incorrect encode settings."); -static const arg_def_t disable_warning_prompt = ARG_DEF( - "y", "disable-warning-prompt", 0, - "Display warnings, but do not prompt user to continue."); +static const arg_def_t framerate = + ARG_DEF(NULL, "fps", 1, "Stream frame rate (rate/scale)"); +static const arg_def_t use_webm = + ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)"); +static const arg_def_t use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF"); +static const arg_def_t out_part = + ARG_DEF("P", "output-partitions", 0, + "Makes encoder output partitions. 
Requires IVF output!"); +static const arg_def_t q_hist_n = + ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)"); +static const arg_def_t rate_hist_n = + ARG_DEF(NULL, "rate-hist", 1, "Show rate histogram (n-buckets)"); +static const arg_def_t disable_warnings = + ARG_DEF(NULL, "disable-warnings", 0, + "Disable warnings about potentially incorrect encode settings."); +static const arg_def_t disable_warning_prompt = + ARG_DEF("y", "disable-warning-prompt", 0, + "Display warnings, but do not prompt user to continue."); #if CONFIG_VP9_HIGHBITDEPTH static const arg_def_t test16bitinternalarg = ARG_DEF( NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer"); #endif -static const arg_def_t *main_args[] = { - &debugmode, - &outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &skip, - &deadline, &best_dl, &good_dl, &rt_dl, - &quietarg, &verbosearg, &psnrarg, &use_webm, &use_ivf, &out_part, &q_hist_n, - &rate_hist_n, &disable_warnings, &disable_warning_prompt, &recontest, - NULL -}; - -static const arg_def_t usage = ARG_DEF( - "u", "usage", 1, "Usage profile number to use"); -static const arg_def_t threads = ARG_DEF( - "t", "threads", 1, "Max number of threads to use"); -static const arg_def_t profile = ARG_DEF( - NULL, "profile", 1, "Bitstream profile number to use"); +static const arg_def_t *main_args[] = { &debugmode, + &outputfile, + &codecarg, + &passes, + &pass_arg, + &fpf_name, + &limit, + &skip, + &deadline, + &best_dl, + &good_dl, + &rt_dl, + &quietarg, + &verbosearg, + &psnrarg, + &use_webm, + &use_ivf, + &out_part, + &q_hist_n, + &rate_hist_n, + &disable_warnings, + &disable_warning_prompt, + &recontest, + NULL }; + +static const arg_def_t usage = + ARG_DEF("u", "usage", 1, "Usage profile number to use"); +static const arg_def_t threads = + ARG_DEF("t", "threads", 1, "Max number of threads to use"); +static const arg_def_t profile = + ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use"); static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width"); static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height"); #if CONFIG_WEBM_IO static const struct arg_enum_list stereo_mode_enum[] = { - {"mono", STEREO_FORMAT_MONO}, - {"left-right", STEREO_FORMAT_LEFT_RIGHT}, - {"bottom-top", STEREO_FORMAT_BOTTOM_TOP}, - {"top-bottom", STEREO_FORMAT_TOP_BOTTOM}, - {"right-left", STEREO_FORMAT_RIGHT_LEFT}, - {NULL, 0} + { "mono", STEREO_FORMAT_MONO }, + { "left-right", STEREO_FORMAT_LEFT_RIGHT }, + { "bottom-top", STEREO_FORMAT_BOTTOM_TOP }, + { "top-bottom", STEREO_FORMAT_TOP_BOTTOM }, + { "right-left", STEREO_FORMAT_RIGHT_LEFT }, + { NULL, 0 } }; static const arg_def_t stereo_mode = ARG_DEF_ENUM( NULL, "stereo-mode", 1, "Stereo 3D video format", stereo_mode_enum); #endif static const arg_def_t timebase = ARG_DEF( NULL, "timebase", 1, "Output timestamp precision (fractional seconds)"); -static const arg_def_t error_resilient = ARG_DEF( - NULL, "error-resilient", 1, "Enable error resiliency features"); -static const arg_def_t lag_in_frames = ARG_DEF( - NULL, "lag-in-frames", 1, "Max number of frames to lag"); - -static const arg_def_t *global_args[] = { - &use_yv12, &use_i420, &use_i422, &use_i444, &use_i440, - &usage, &threads, &profile, - &width, &height, +static const arg_def_t error_resilient = + ARG_DEF(NULL, "error-resilient", 1, "Enable error resiliency features"); +static const arg_def_t lag_in_frames = + ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"); + +static const arg_def_t *global_args[] = { &use_yv12, 
+ &use_i420, + &use_i422, + &use_i444, + &use_i440, + &usage, + &threads, + &profile, + &width, + &height, #if CONFIG_WEBM_IO - &stereo_mode, + &stereo_mode, #endif - &timebase, &framerate, - &error_resilient, + &timebase, + &framerate, + &error_resilient, #if CONFIG_VP9_HIGHBITDEPTH - &test16bitinternalarg, + &test16bitinternalarg, #endif - &lag_in_frames, NULL -}; - -static const arg_def_t dropframe_thresh = ARG_DEF( - NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); -static const arg_def_t resize_allowed = ARG_DEF( - NULL, "resize-allowed", 1, "Spatial resampling enabled (bool)"); -static const arg_def_t resize_width = ARG_DEF( - NULL, "resize-width", 1, "Width of encoded frame"); -static const arg_def_t resize_height = ARG_DEF( - NULL, "resize-height", 1, "Height of encoded frame"); -static const arg_def_t resize_up_thresh = ARG_DEF( - NULL, "resize-up", 1, "Upscale threshold (buf %)"); -static const arg_def_t resize_down_thresh = ARG_DEF( - NULL, "resize-down", 1, "Downscale threshold (buf %)"); -static const struct arg_enum_list end_usage_enum[] = { - {"vbr", VPX_VBR}, - {"cbr", VPX_CBR}, - {"cq", VPX_CQ}, - {"q", VPX_Q}, - {NULL, 0} -}; -static const arg_def_t end_usage = ARG_DEF_ENUM( - NULL, "end-usage", 1, "Rate control mode", end_usage_enum); -static const arg_def_t target_bitrate = ARG_DEF( - NULL, "target-bitrate", 1, "Bitrate (kbps)"); -static const arg_def_t min_quantizer = ARG_DEF( - NULL, "min-q", 1, "Minimum (best) quantizer"); -static const arg_def_t max_quantizer = ARG_DEF( - NULL, "max-q", 1, "Maximum (worst) quantizer"); -static const arg_def_t undershoot_pct = ARG_DEF( - NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)"); -static const arg_def_t overshoot_pct = ARG_DEF( - NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)"); -static const arg_def_t buf_sz = ARG_DEF( - NULL, "buf-sz", 1, "Client buffer size (ms)"); -static const arg_def_t buf_initial_sz = ARG_DEF( - NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)"); -static const arg_def_t buf_optimal_sz = ARG_DEF( - NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)"); + &lag_in_frames, + NULL }; + +static const arg_def_t dropframe_thresh = + ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); +static const arg_def_t resize_allowed = + ARG_DEF(NULL, "resize-allowed", 1, "Spatial resampling enabled (bool)"); +static const arg_def_t resize_width = + ARG_DEF(NULL, "resize-width", 1, "Width of encoded frame"); +static const arg_def_t resize_height = + ARG_DEF(NULL, "resize-height", 1, "Height of encoded frame"); +static const arg_def_t resize_up_thresh = + ARG_DEF(NULL, "resize-up", 1, "Upscale threshold (buf %)"); +static const arg_def_t resize_down_thresh = + ARG_DEF(NULL, "resize-down", 1, "Downscale threshold (buf %)"); +static const struct arg_enum_list end_usage_enum[] = { { "vbr", VPX_VBR }, + { "cbr", VPX_CBR }, + { "cq", VPX_CQ }, + { "q", VPX_Q }, + { NULL, 0 } }; +static const arg_def_t end_usage = + ARG_DEF_ENUM(NULL, "end-usage", 1, "Rate control mode", end_usage_enum); +static const arg_def_t target_bitrate = + ARG_DEF(NULL, "target-bitrate", 1, "Bitrate (kbps)"); +static const arg_def_t min_quantizer = + ARG_DEF(NULL, "min-q", 1, "Minimum (best) quantizer"); +static const arg_def_t max_quantizer = + ARG_DEF(NULL, "max-q", 1, "Maximum (worst) quantizer"); +static const arg_def_t undershoot_pct = + ARG_DEF(NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)"); +static const arg_def_t overshoot_pct = + 
ARG_DEF(NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)"); +static const arg_def_t buf_sz = + ARG_DEF(NULL, "buf-sz", 1, "Client buffer size (ms)"); +static const arg_def_t buf_initial_sz = + ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)"); +static const arg_def_t buf_optimal_sz = + ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)"); static const arg_def_t *rc_args[] = { - &dropframe_thresh, &resize_allowed, &resize_width, &resize_height, - &resize_up_thresh, &resize_down_thresh, &end_usage, &target_bitrate, - &min_quantizer, &max_quantizer, &undershoot_pct, &overshoot_pct, &buf_sz, - &buf_initial_sz, &buf_optimal_sz, NULL -}; - - -static const arg_def_t bias_pct = ARG_DEF( - NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"); -static const arg_def_t minsection_pct = ARG_DEF( - NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); -static const arg_def_t maxsection_pct = ARG_DEF( - NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); -static const arg_def_t *rc_twopass_args[] = { - &bias_pct, &minsection_pct, &maxsection_pct, NULL -}; - - -static const arg_def_t kf_min_dist = ARG_DEF( - NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); -static const arg_def_t kf_max_dist = ARG_DEF( - NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)"); -static const arg_def_t kf_disabled = ARG_DEF( - NULL, "disable-kf", 0, "Disable keyframe placement"); -static const arg_def_t *kf_args[] = { - &kf_min_dist, &kf_max_dist, &kf_disabled, NULL + &dropframe_thresh, &resize_allowed, &resize_width, &resize_height, + &resize_up_thresh, &resize_down_thresh, &end_usage, &target_bitrate, + &min_quantizer, &max_quantizer, &undershoot_pct, &overshoot_pct, + &buf_sz, &buf_initial_sz, &buf_optimal_sz, NULL }; - -static const arg_def_t noise_sens = ARG_DEF( - NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"); -static const arg_def_t sharpness = ARG_DEF( - NULL, "sharpness", 1, "Loop filter sharpness (0..7)"); -static const arg_def_t static_thresh = ARG_DEF( - NULL, "static-thresh", 1, "Motion detection threshold"); -static const arg_def_t auto_altref = ARG_DEF( - NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); -static const arg_def_t arnr_maxframes = ARG_DEF( - NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"); -static const arg_def_t arnr_strength = ARG_DEF( - NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"); -static const arg_def_t arnr_type = ARG_DEF( - NULL, "arnr-type", 1, "AltRef type"); +static const arg_def_t bias_pct = + ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"); +static const arg_def_t minsection_pct = + ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); +static const arg_def_t maxsection_pct = + ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); +static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, + &maxsection_pct, NULL }; + +static const arg_def_t kf_min_dist = + ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); +static const arg_def_t kf_max_dist = + ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)"); +static const arg_def_t kf_disabled = + ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement"); +static const arg_def_t *kf_args[] = { &kf_min_dist, &kf_max_dist, &kf_disabled, + NULL }; + +static const arg_def_t noise_sens = + ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"); +static const arg_def_t sharpness = + 
ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7)"); +static const arg_def_t static_thresh = + ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"); +static const arg_def_t auto_altref = + ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); +static const arg_def_t arnr_maxframes = + ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"); +static const arg_def_t arnr_strength = + ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"); +static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1, "AltRef type"); static const struct arg_enum_list tuning_enum[] = { - {"psnr", VPX_TUNE_PSNR}, - {"ssim", VPX_TUNE_SSIM}, - {NULL, 0} + { "psnr", VPX_TUNE_PSNR }, { "ssim", VPX_TUNE_SSIM }, { NULL, 0 } }; -static const arg_def_t tune_ssim = ARG_DEF_ENUM( - NULL, "tune", 1, "Material to favor", tuning_enum); -static const arg_def_t cq_level = ARG_DEF( - NULL, "cq-level", 1, "Constant/Constrained Quality level"); -static const arg_def_t max_intra_rate_pct = ARG_DEF( - NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); - +static const arg_def_t tune_ssim = + ARG_DEF_ENUM(NULL, "tune", 1, "Material to favor", tuning_enum); +static const arg_def_t cq_level = + ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"); +static const arg_def_t max_intra_rate_pct = + ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); #if CONFIG_VP10_ENCODER -static const arg_def_t cpu_used_vp9 = ARG_DEF( - NULL, "cpu-used", 1, "CPU Used (-8..8)"); -static const arg_def_t tile_cols = ARG_DEF( - NULL, "tile-columns", 1, "Number of tile columns to use, log2"); -static const arg_def_t tile_rows = ARG_DEF( - NULL, "tile-rows", 1, - "Number of tile rows to use, log2 (set to 0 while threads > 1)"); -static const arg_def_t lossless = ARG_DEF( - NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)"); +static const arg_def_t cpu_used_vp9 = + ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-8..8)"); +static const arg_def_t tile_cols = + ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"); +static const arg_def_t tile_rows = + ARG_DEF(NULL, "tile-rows", 1, + "Number of tile rows to use, log2 (set to 0 while threads > 1)"); +static const arg_def_t lossless = + ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)"); static const arg_def_t frame_parallel_decoding = ARG_DEF( NULL, "frame-parallel", 1, "Enable frame parallel decodability features"); static const arg_def_t aq_mode = ARG_DEF( NULL, "aq-mode", 1, "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, " "3: cyclic refresh, 4: equator360)"); -static const arg_def_t frame_periodic_boost = ARG_DEF( - NULL, "frame-boost", 1, - "Enable frame periodic boost (0: off (default), 1: on)"); +static const arg_def_t frame_periodic_boost = + ARG_DEF(NULL, "frame-boost", 1, + "Enable frame periodic boost (0: off (default), 1: on)"); static const arg_def_t gf_cbr_boost_pct = ARG_DEF( NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"); -static const arg_def_t max_inter_rate_pct = ARG_DEF( - NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"); +static const arg_def_t max_inter_rate_pct = + ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"); static const arg_def_t min_gf_interval = ARG_DEF( NULL, "min-gf-interval", 1, "min gf/arf frame interval (default 0, indicating in-built behavior)"); @@ -393,30 +398,27 @@ static const struct arg_enum_list color_space_enum[] = { { NULL, 0 } }; -static const 
arg_def_t input_color_space = ARG_DEF_ENUM( - NULL, "color-space", 1, - "The color space of input content:", color_space_enum); +static const arg_def_t input_color_space = + ARG_DEF_ENUM(NULL, "color-space", 1, "The color space of input content:", + color_space_enum); #if CONFIG_VP9_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { - {"8", VPX_BITS_8}, - {"10", VPX_BITS_10}, - {"12", VPX_BITS_12}, - {NULL, 0} + { "8", VPX_BITS_8 }, { "10", VPX_BITS_10 }, { "12", VPX_BITS_12 }, { NULL, 0 } }; static const arg_def_t bitdeptharg = ARG_DEF_ENUM( "b", "bit-depth", 1, "Bit depth for codec (8 for version <=1, 10 or 12 for version 2)", bitdepth_enum); -static const arg_def_t inbitdeptharg = ARG_DEF( - NULL, "input-bit-depth", 1, "Bit depth of input"); +static const arg_def_t inbitdeptharg = + ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input"); #endif static const struct arg_enum_list tune_content_enum[] = { - {"default", VPX_CONTENT_DEFAULT}, - {"screen", VPX_CONTENT_SCREEN}, - {NULL, 0} + { "default", VPX_CONTENT_DEFAULT }, + { "screen", VPX_CONTENT_SCREEN }, + { NULL, 0 } }; static const arg_def_t tune_content = ARG_DEF_ENUM( @@ -428,51 +430,76 @@ static const arg_def_t target_level = ARG_DEF( " 11: level 1.1; ... 62: level 6.2)"); #endif - #if CONFIG_VP10_ENCODER #if CONFIG_EXT_PARTITION static const struct arg_enum_list superblock_size_enum[] = { - {"dynamic", VPX_SUPERBLOCK_SIZE_DYNAMIC}, - {"64", VPX_SUPERBLOCK_SIZE_64X64}, - {"128", VPX_SUPERBLOCK_SIZE_128X128}, - {NULL, 0} + { "dynamic", VPX_SUPERBLOCK_SIZE_DYNAMIC }, + { "64", VPX_SUPERBLOCK_SIZE_64X64 }, + { "128", VPX_SUPERBLOCK_SIZE_128X128 }, + { NULL, 0 } }; static const arg_def_t superblock_size = ARG_DEF_ENUM( NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum); #endif // CONFIG_EXT_PARTITION -static const arg_def_t *vp10_args[] = { - &cpu_used_vp9, &auto_altref, &sharpness, &static_thresh, - &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type, - &tune_ssim, &cq_level, &max_intra_rate_pct, &max_inter_rate_pct, - &gf_cbr_boost_pct, &lossless, - &frame_parallel_decoding, &aq_mode, &frame_periodic_boost, - &noise_sens, &tune_content, &input_color_space, - &min_gf_interval, &max_gf_interval, +static const arg_def_t *vp10_args[] = { &cpu_used_vp9, + &auto_altref, + &sharpness, + &static_thresh, + &tile_cols, + &tile_rows, + &arnr_maxframes, + &arnr_strength, + &arnr_type, + &tune_ssim, + &cq_level, + &max_intra_rate_pct, + &max_inter_rate_pct, + &gf_cbr_boost_pct, + &lossless, + &frame_parallel_decoding, + &aq_mode, + &frame_periodic_boost, + &noise_sens, + &tune_content, + &input_color_space, + &min_gf_interval, + &max_gf_interval, #if CONFIG_EXT_PARTITION - &superblock_size, + &superblock_size, #endif // CONFIG_EXT_PARTITION #if CONFIG_VP9_HIGHBITDEPTH - &bitdeptharg, &inbitdeptharg, + &bitdeptharg, + &inbitdeptharg, #endif // CONFIG_VP9_HIGHBITDEPTH - NULL -}; -static const int vp10_arg_ctrl_map[] = { - VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF, - VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD, - VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS, - VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE, - VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT, - VP9E_SET_MAX_INTER_BITRATE_PCT, VP9E_SET_GF_CBR_BOOST_PCT, - VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE, - VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY, - VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE, - VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL, + NULL }; 
+static const int vp10_arg_ctrl_map[] = { VP8E_SET_CPUUSED, + VP8E_SET_ENABLEAUTOALTREF, + VP8E_SET_SHARPNESS, + VP8E_SET_STATIC_THRESHOLD, + VP9E_SET_TILE_COLUMNS, + VP9E_SET_TILE_ROWS, + VP8E_SET_ARNR_MAXFRAMES, + VP8E_SET_ARNR_STRENGTH, + VP8E_SET_ARNR_TYPE, + VP8E_SET_TUNING, + VP8E_SET_CQ_LEVEL, + VP8E_SET_MAX_INTRA_BITRATE_PCT, + VP9E_SET_MAX_INTER_BITRATE_PCT, + VP9E_SET_GF_CBR_BOOST_PCT, + VP9E_SET_LOSSLESS, + VP9E_SET_FRAME_PARALLEL_DECODING, + VP9E_SET_AQ_MODE, + VP9E_SET_FRAME_PERIODIC_BOOST, + VP9E_SET_NOISE_SENSITIVITY, + VP9E_SET_TUNE_CONTENT, + VP9E_SET_COLOR_SPACE, + VP9E_SET_MIN_GF_INTERVAL, + VP9E_SET_MAX_GF_INTERVAL, #if CONFIG_EXT_PARTITION - VP10E_SET_SUPERBLOCK_SIZE, + VP10E_SET_SUPERBLOCK_SIZE, #endif // CONFIG_EXT_PARTITION - 0 -}; + 0 }; #endif static const arg_def_t *no_args[] = { NULL }; @@ -498,17 +525,17 @@ void usage_exit(void) { fprintf(stderr, "\nVP10 Specific Options:\n"); arg_show_usage(stderr, vp10_args); #endif - fprintf(stderr, "\nStream timebase (--timebase):\n" + fprintf(stderr, + "\nStream timebase (--timebase):\n" " The desired precision of timestamps in the output, expressed\n" " in fractional seconds. Default is 1/1000.\n"); fprintf(stderr, "\nIncluded encoders:\n\n"); for (i = 0; i < num_encoder; ++i) { const VpxInterface *const encoder = get_vpx_encoder_by_index(i); - const char* defstr = (i == (num_encoder - 1)) ? "(default)" : ""; - fprintf(stderr, " %-6s - %s %s\n", - encoder->name, vpx_codec_iface_name(encoder->codec_interface()), - defstr); + const char *defstr = (i == (num_encoder - 1)) ? "(default)" : ""; + fprintf(stderr, " %-6s - %s %s\n", encoder->name, + vpx_codec_iface_name(encoder->codec_interface()), defstr); } fprintf(stderr, "\n "); fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n"); @@ -516,12 +543,12 @@ void usage_exit(void) { exit(EXIT_FAILURE); } -#define mmin(a, b) ((a) < (b) ? (a) : (b)) +#define mmin(a, b) ((a) < (b) ? 
(a) : (b)) #if CONFIG_VP9_HIGHBITDEPTH static void find_mismatch_high(const vpx_image_t *const img1, - const vpx_image_t *const img2, - int yloc[4], int uloc[4], int vloc[4]) { + const vpx_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]) { uint16_t *plane1, *plane2; uint32_t stride1, stride2; const uint32_t bsize = 64; @@ -534,10 +561,10 @@ static void find_mismatch_high(const vpx_image_t *const img1, int match = 1; uint32_t i, j; yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; - plane1 = (uint16_t*)img1->planes[VPX_PLANE_Y]; - plane2 = (uint16_t*)img2->planes[VPX_PLANE_Y]; - stride1 = img1->stride[VPX_PLANE_Y]/2; - stride2 = img2->stride[VPX_PLANE_Y]/2; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_Y]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_Y]; + stride1 = img1->stride[VPX_PLANE_Y] / 2; + stride2 = img2->stride[VPX_PLANE_Y] / 2; for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { for (j = 0; match && j < img1->d_w; j += bsize) { int k, l; @@ -560,10 +587,10 @@ static void find_mismatch_high(const vpx_image_t *const img1, } uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; - plane1 = (uint16_t*)img1->planes[VPX_PLANE_U]; - plane2 = (uint16_t*)img2->planes[VPX_PLANE_U]; - stride1 = img1->stride[VPX_PLANE_U]/2; - stride2 = img2->stride[VPX_PLANE_U]/2; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_U]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_U]; + stride1 = img1->stride[VPX_PLANE_U] / 2; + stride2 = img2->stride[VPX_PLANE_U] / 2; for (i = 0, match = 1; match && i < c_h; i += bsizey) { for (j = 0; match && j < c_w; j += bsizex) { int k, l; @@ -586,10 +613,10 @@ static void find_mismatch_high(const vpx_image_t *const img1, } vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; - plane1 = (uint16_t*)img1->planes[VPX_PLANE_V]; - plane2 = (uint16_t*)img2->planes[VPX_PLANE_V]; - stride1 = img1->stride[VPX_PLANE_V]/2; - stride2 = img2->stride[VPX_PLANE_V]/2; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_V]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_V]; + stride1 = img1->stride[VPX_PLANE_V] / 2; + stride2 = img2->stride[VPX_PLANE_V] / 2; for (i = 0, match = 1; match && i < c_h; i += bsizey) { for (j = 0; match && j < c_w; j += bsizex) { int k, l; @@ -614,8 +641,8 @@ static void find_mismatch_high(const vpx_image_t *const img1, #endif static void find_mismatch(const vpx_image_t *const img1, - const vpx_image_t *const img2, - int yloc[4], int uloc[4], int vloc[4]) { + const vpx_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]) { const uint32_t bsize = 64; const uint32_t bsizey = bsize >> img1->y_chroma_shift; const uint32_t bsizex = bsize >> img1->x_chroma_shift; @@ -706,8 +733,7 @@ static void find_mismatch(const vpx_image_t *const img1, static int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2) { uint32_t l_w = img1->d_w; - uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; const uint32_t c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; uint32_t i; @@ -741,84 +767,79 @@ static int compare_img(const vpx_image_t *const img1, return match; } - -#define NELEMENTS(x) (sizeof(x)/sizeof(x[0])) +#define NELEMENTS(x) (sizeof(x) / sizeof(x[0])) #if CONFIG_VP10_ENCODER #define ARG_CTRL_CNT_MAX NELEMENTS(vp10_arg_ctrl_map) #endif #if !CONFIG_WEBM_IO typedef int stereo_format_t; -struct WebmOutputContext { int debug; }; +struct WebmOutputContext { + int debug; +}; #endif /* Per-stream configuration */ struct stream_config { - struct 
vpx_codec_enc_cfg cfg; - const char *out_fn; - const char *stats_fn; + struct vpx_codec_enc_cfg cfg; + const char *out_fn; + const char *stats_fn; #if CONFIG_FP_MB_STATS - const char *fpmb_stats_fn; + const char *fpmb_stats_fn; #endif - stereo_format_t stereo_fmt; - int arg_ctrls[ARG_CTRL_CNT_MAX][2]; - int arg_ctrl_cnt; - int write_webm; + stereo_format_t stereo_fmt; + int arg_ctrls[ARG_CTRL_CNT_MAX][2]; + int arg_ctrl_cnt; + int write_webm; #if CONFIG_VP9_HIGHBITDEPTH // whether to use 16bit internal buffers - int use_16bit_internal; + int use_16bit_internal; #endif }; - struct stream_state { - int index; - struct stream_state *next; - struct stream_config config; - FILE *file; - struct rate_hist *rate_hist; - struct WebmOutputContext webm_ctx; - uint64_t psnr_sse_total; - uint64_t psnr_samples_total; - double psnr_totals[4]; - int psnr_count; - int counts[64]; - vpx_codec_ctx_t encoder; - unsigned int frames_out; - uint64_t cx_time; - size_t nbytes; - stats_io_t stats; + int index; + struct stream_state *next; + struct stream_config config; + FILE *file; + struct rate_hist *rate_hist; + struct WebmOutputContext webm_ctx; + uint64_t psnr_sse_total; + uint64_t psnr_samples_total; + double psnr_totals[4]; + int psnr_count; + int counts[64]; + vpx_codec_ctx_t encoder; + unsigned int frames_out; + uint64_t cx_time; + size_t nbytes; + stats_io_t stats; #if CONFIG_FP_MB_STATS - stats_io_t fpmb_stats; + stats_io_t fpmb_stats; #endif - struct vpx_image *img; - vpx_codec_ctx_t decoder; - int mismatch_seen; + struct vpx_image *img; + vpx_codec_ctx_t decoder; + int mismatch_seen; }; - -static void validate_positive_rational(const char *msg, +static void validate_positive_rational(const char *msg, struct vpx_rational *rat) { if (rat->den < 0) { rat->num *= -1; rat->den *= -1; } - if (rat->num < 0) - die("Error: %s must be positive\n", msg); + if (rat->num < 0) die("Error: %s must be positive\n", msg); - if (!rat->den) - die("Error: %s has zero denominator\n", msg); + if (!rat->den) die("Error: %s has zero denominator\n", msg); } - static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { - char **argi, **argj; - struct arg arg; + char **argi, **argj; + struct arg arg; const int num_encoder = get_vpx_encoder_count(); - if (num_encoder < 1) - die("Error: no valid encoder available\n"); + if (num_encoder < 1) die("Error: no valid encoder available\n"); /* Initialize default parameters */ memset(global, 0, sizeof(*global)); @@ -844,8 +865,7 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { global->pass = arg_parse_uint(&arg); if (global->pass < 1 || global->pass > 2) - die("Error: Invalid pass selected (%d)\n", - global->pass); + die("Error: Invalid pass selected (%d)\n", global->pass); } else if (arg_match(&arg, &usage, argi)) global->usage = arg_parse_uint(&arg); else if (arg_match(&arg, &deadline, argi)) @@ -901,8 +921,8 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { if (global->pass) { /* DWIM: Assume the user meant passes=2 if pass=2 is specified */ if (global->pass > global->passes) { - warn("Assuming --pass=%d implies --passes=%d\n", - global->pass, global->pass); + warn("Assuming --pass=%d implies --passes=%d\n", global->pass, + global->pass); global->passes = global->pass; } } @@ -913,27 +933,26 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { // encoder if (global->codec != NULL && global->codec->name != NULL) global->passes = (strcmp(global->codec->name, "vp9") == 0 && 
- global->deadline != VPX_DL_REALTIME) ? 2 : 1; + global->deadline != VPX_DL_REALTIME) + ? 2 + : 1; #else global->passes = 1; #endif } - if (global->deadline == VPX_DL_REALTIME && - global->passes > 1) { + if (global->deadline == VPX_DL_REALTIME && global->passes > 1) { warn("Enforcing one-pass encoding in realtime mode\n"); global->passes = 1; } } - static void open_input_file(struct VpxInputContext *input) { /* Parse certain options from the input file, if possible */ - input->file = strcmp(input->filename, "-") - ? fopen(input->filename, "rb") : set_binary_mode(stdin); + input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") + : set_binary_mode(stdin); - if (!input->file) - fatal("Failed to open input file"); + if (!input->file) fatal("Failed to open input file"); if (!fseeko(input->file, 0, SEEK_END)) { /* Input file is seekable. Figure out how long it is, so we can get @@ -953,8 +972,7 @@ static void open_input_file(struct VpxInputContext *input) { input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); input->detect.position = 0; - if (input->detect.buf_read == 4 - && file_is_y4m(input->detect.buf)) { + if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, input->only_i420) >= 0) { input->file_type = FILE_TYPE_Y4M; @@ -975,11 +993,9 @@ static void open_input_file(struct VpxInputContext *input) { } } - static void close_input_file(struct VpxInputContext *input) { fclose(input->file); - if (input->file_type == FILE_TYPE_Y4M) - y4m_input_close(&input->y4m); + if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); } static struct stream_state *new_stream(struct VpxEncoderConfig *global, @@ -996,14 +1012,12 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global, stream->index++; prev->next = stream; } else { - vpx_codec_err_t res; + vpx_codec_err_t res; /* Populate encoder configuration */ res = vpx_codec_enc_config_default(global->codec->codec_interface(), - &stream->config.cfg, - global->usage); - if (res) - fatal("Failed to get config: %s\n", vpx_codec_err_to_string(res)); + &stream->config.cfg, global->usage); + if (res) fatal("Failed to get config: %s\n", vpx_codec_err_to_string(res)); /* Change the default timebase to a high enough value so that the * encoder will always create strictly increasing timestamps. 
@@ -1040,18 +1054,16 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global, return stream; } - static int parse_stream_params(struct VpxEncoderConfig *global, - struct stream_state *stream, - char **argv) { - char **argi, **argj; - struct arg arg; + struct stream_state *stream, char **argv) { + char **argi, **argj; + struct arg arg; static const arg_def_t **ctrl_args = no_args; - static const int *ctrl_args_map = NULL; - struct stream_config *config = &stream->config; - int eos_mark_found = 0; + static const int *ctrl_args_map = NULL; + struct stream_config *config = &stream->config; + int eos_mark_found = 0; #if CONFIG_VP9_HIGHBITDEPTH - int test_16bit_internal = 0; + int test_16bit_internal = 0; #endif // Handle codec specific options @@ -1156,7 +1168,7 @@ static int parse_stream_params(struct VpxEncoderConfig *global, } else if (arg_match(&arg, &buf_optimal_sz, argi)) { config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg); } else if (arg_match(&arg, &bias_pct, argi)) { - config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg); + config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg); if (global->passes < 2) warn("option %s ignored in one-pass mode.\n", arg.name); } else if (arg_match(&arg, &minsection_pct, argi)) { @@ -1202,43 +1214,41 @@ static int parse_stream_params(struct VpxEncoderConfig *global, if (ctrl_args_map != NULL && j < (int)ARG_CTRL_CNT_MAX) { config->arg_ctrls[j][0] = ctrl_args_map[i]; config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg); - if (j == config->arg_ctrl_cnt) - config->arg_ctrl_cnt++; + if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++; } } } - if (!match) - argj++; + if (!match) argj++; } } #if CONFIG_VP9_HIGHBITDEPTH if (strcmp(global->codec->name, "vp9") == 0 || strcmp(global->codec->name, "vp10") == 0) { - config->use_16bit_internal = test_16bit_internal | - (config->cfg.g_profile > 1); + config->use_16bit_internal = + test_16bit_internal | (config->cfg.g_profile > 1); } #endif return eos_mark_found; } - -#define FOREACH_STREAM(func) \ - do { \ - struct stream_state *stream; \ +#define FOREACH_STREAM(func) \ + do { \ + struct stream_state *stream; \ for (stream = streams; stream; stream = stream->next) { \ - func; \ - } \ + func; \ + } \ } while (0) - static void validate_stream_config(const struct stream_state *stream, const struct VpxEncoderConfig *global) { const struct stream_state *streami; (void)global; if (!stream->config.cfg.g_w || !stream->config.cfg.g_h) - fatal("Stream %d: Specify stream dimensions with --width (-w) " - " and --height (-h)", stream->index); + fatal( + "Stream %d: Specify stream dimensions with --width (-w) " + " and --height (-h)", + stream->index); // Check that the codec bit depth is greater than the input bit depth. 
if (stream->config.cfg.g_input_bit_depth > @@ -1285,9 +1295,7 @@ static void validate_stream_config(const struct stream_state *stream, } } - -static void set_stream_dimensions(struct stream_state *stream, - unsigned int w, +static void set_stream_dimensions(struct stream_state *stream, unsigned int w, unsigned int h) { if (!stream->config.cfg.g_w) { if (!stream->config.cfg.g_h) @@ -1300,7 +1308,7 @@ static void set_stream_dimensions(struct stream_state *stream, } } -static const char* file_type_to_string(enum VideoFileType t) { +static const char *file_type_to_string(enum VideoFileType t) { switch (t) { case FILE_TYPE_RAW: return "RAW"; case FILE_TYPE_Y4M: return "Y4M"; @@ -1308,7 +1316,7 @@ static const char* file_type_to_string(enum VideoFileType t) { } } -static const char* image_format_to_string(vpx_img_fmt_t f) { +static const char *image_format_to_string(vpx_img_fmt_t f) { switch (f) { case VPX_IMG_FMT_I420: return "I420"; case VPX_IMG_FMT_I422: return "I422"; @@ -1326,7 +1334,6 @@ static const char* image_format_to_string(vpx_img_fmt_t f) { static void show_stream_config(struct stream_state *stream, struct VpxEncoderConfig *global, struct VpxInputContext *input) { - #define SHOW(field) \ fprintf(stderr, " %-28s = %d\n", #field, stream->config.cfg.field) @@ -1334,8 +1341,7 @@ static void show_stream_config(struct stream_state *stream, fprintf(stderr, "Codec: %s\n", vpx_codec_iface_name(global->codec->codec_interface())); fprintf(stderr, "Source file: %s File Type: %s Format: %s\n", - input->filename, - file_type_to_string(input->file_type), + input->filename, file_type_to_string(input->file_type), image_format_to_string(input->fmt)); } if (stream->next || stream->index) @@ -1378,20 +1384,17 @@ static void show_stream_config(struct stream_state *stream, SHOW(kf_max_dist); } - static void open_output_file(struct stream_state *stream, struct VpxEncoderConfig *global, const struct VpxRational *pixel_aspect_ratio) { const char *fn = stream->config.out_fn; const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg; - if (cfg->g_pass == VPX_RC_FIRST_PASS) - return; + if (cfg->g_pass == VPX_RC_FIRST_PASS) return; stream->file = strcmp(fn, "-") ? 
fopen(fn, "wb") : set_binary_mode(stdout); - if (!stream->file) - fatal("Failed to open output file"); + if (!stream->file) fatal("Failed to open output file"); if (stream->config.write_webm && fseek(stream->file, 0, SEEK_CUR)) fatal("WebM output to pipes not supported."); @@ -1399,10 +1402,8 @@ static void open_output_file(struct stream_state *stream, #if CONFIG_WEBM_IO if (stream->config.write_webm) { stream->webm_ctx.stream = stream->file; - write_webm_file_header(&stream->webm_ctx, cfg, - &global->framerate, - stream->config.stereo_fmt, - global->codec->fourcc, + write_webm_file_header(&stream->webm_ctx, cfg, &global->framerate, + stream->config.stereo_fmt, global->codec->fourcc, pixel_aspect_ratio); } #else @@ -1414,13 +1415,11 @@ static void open_output_file(struct stream_state *stream, } } - static void close_output_file(struct stream_state *stream, unsigned int fourcc) { const struct vpx_codec_enc_cfg *const cfg = &stream->config.cfg; - if (cfg->g_pass == VPX_RC_FIRST_PASS) - return; + if (cfg->g_pass == VPX_RC_FIRST_PASS) return; #if CONFIG_WEBM_IO if (stream->config.write_webm) { @@ -1430,21 +1429,17 @@ static void close_output_file(struct stream_state *stream, if (!stream->config.write_webm) { if (!fseek(stream->file, 0, SEEK_SET)) - ivf_write_file_header(stream->file, &stream->config.cfg, - fourcc, + ivf_write_file_header(stream->file, &stream->config.cfg, fourcc, stream->frames_out); } fclose(stream->file); } - static void setup_pass(struct stream_state *stream, - struct VpxEncoderConfig *global, - int pass) { + struct VpxEncoderConfig *global, int pass) { if (stream->config.stats_fn) { - if (!stats_open_file(&stream->stats, stream->config.stats_fn, - pass)) + if (!stats_open_file(&stream->stats, stream->config.stats_fn, pass)) fatal("Failed to open statistics store"); } else { if (!stats_open_mem(&stream->stats, pass)) @@ -1453,8 +1448,8 @@ static void setup_pass(struct stream_state *stream, #if CONFIG_FP_MB_STATS if (stream->config.fpmb_stats_fn) { - if (!stats_open_file(&stream->fpmb_stats, - stream->config.fpmb_stats_fn, pass)) + if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn, + pass)) fatal("Failed to open mb statistics store"); } else { if (!stats_open_mem(&stream->fpmb_stats, pass)) @@ -1463,8 +1458,8 @@ static void setup_pass(struct stream_state *stream, #endif stream->config.cfg.g_pass = global->passes == 2 - ? pass ? VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS - : VPX_RC_ONE_PASS; + ? pass ? 
VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS + : VPX_RC_ONE_PASS; if (pass) { stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats); #if CONFIG_FP_MB_STATS @@ -1478,7 +1473,6 @@ static void setup_pass(struct stream_state *stream, stream->frames_out = 0; } - static void initialize_encoder(struct stream_state *stream, struct VpxEncoderConfig *global) { int i; @@ -1503,8 +1497,7 @@ static void initialize_encoder(struct stream_state *stream, int ctrl = stream->config.arg_ctrls[i][0]; int value = stream->config.arg_ctrls[i][1]; if (vpx_codec_control_(&stream->encoder, ctrl, value)) - fprintf(stderr, "Error: Tried to set control %d = %d\n", - ctrl, value); + fprintf(stderr, "Error: Tried to set control %d = %d\n", ctrl, value); ctx_exit_on_error(&stream->encoder, "Failed to control codec"); } @@ -1512,7 +1505,7 @@ static void initialize_encoder(struct stream_state *stream, #if CONFIG_DECODERS if (global->test_decode != TEST_DECODE_OFF) { const VpxInterface *decoder = get_vpx_decoder_by_name(global->codec->name); - vpx_codec_dec_cfg_t cfg = { 0, 0, 0}; + vpx_codec_dec_cfg_t cfg = { 0, 0, 0 }; vpx_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0); #if CONFIG_VP10_DECODER && CONFIG_EXT_TILE @@ -1528,23 +1521,21 @@ static void initialize_encoder(struct stream_state *stream, #endif } - static void encode_frame(struct stream_state *stream, - struct VpxEncoderConfig *global, - struct vpx_image *img, + struct VpxEncoderConfig *global, struct vpx_image *img, unsigned int frames_in) { vpx_codec_pts_t frame_start, next_frame_start; struct vpx_codec_enc_cfg *cfg = &stream->config.cfg; struct vpx_usec_timer timer; - frame_start = (cfg->g_timebase.den * (int64_t)(frames_in - 1) - * global->framerate.den) - / cfg->g_timebase.num / global->framerate.num; - next_frame_start = (cfg->g_timebase.den * (int64_t)(frames_in) - * global->framerate.den) - / cfg->g_timebase.num / global->framerate.num; + frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in - 1) * global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; + next_frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in)*global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; - /* Scale if necessary */ +/* Scale if necessary */ #if CONFIG_VP9_HIGHBITDEPTH if (img) { if ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) && @@ -1555,32 +1546,28 @@ static void encode_frame(struct stream_state *stream, } #if CONFIG_LIBYUV if (!stream->img) { - stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I42016, - cfg->g_w, cfg->g_h, 16); + stream->img = + vpx_img_alloc(NULL, VPX_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16); } - I420Scale_16((uint16*)img->planes[VPX_PLANE_Y], - img->stride[VPX_PLANE_Y]/2, - (uint16*)img->planes[VPX_PLANE_U], - img->stride[VPX_PLANE_U]/2, - (uint16*)img->planes[VPX_PLANE_V], - img->stride[VPX_PLANE_V]/2, - img->d_w, img->d_h, - (uint16*)stream->img->planes[VPX_PLANE_Y], - stream->img->stride[VPX_PLANE_Y]/2, - (uint16*)stream->img->planes[VPX_PLANE_U], - stream->img->stride[VPX_PLANE_U]/2, - (uint16*)stream->img->planes[VPX_PLANE_V], - stream->img->stride[VPX_PLANE_V]/2, - stream->img->d_w, stream->img->d_h, - kFilterBox); + I420Scale_16( + (uint16 *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2, + (uint16 *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2, + (uint16 *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2, + img->d_w, img->d_h, (uint16 *)stream->img->planes[VPX_PLANE_Y], + stream->img->stride[VPX_PLANE_Y] / 2, + (uint16 *)stream->img->planes[VPX_PLANE_U], + 
stream->img->stride[VPX_PLANE_U] / 2, + (uint16 *)stream->img->planes[VPX_PLANE_V], + stream->img->stride[VPX_PLANE_V] / 2, stream->img->d_w, + stream->img->d_h, kFilterBox); img = stream->img; #else - stream->encoder.err = 1; - ctx_exit_on_error(&stream->encoder, - "Stream %d: Failed to encode frame.\n" - "Scaling disabled in this configuration. \n" - "To enable, configure with --enable-libyuv\n", - stream->index); + stream->encoder.err = 1; + ctx_exit_on_error(&stream->encoder, + "Stream %d: Failed to encode frame.\n" + "Scaling disabled in this configuration. \n" + "To enable, configure with --enable-libyuv\n", + stream->index); #endif } } @@ -1592,20 +1579,16 @@ static void encode_frame(struct stream_state *stream, } #if CONFIG_LIBYUV if (!stream->img) - stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, - cfg->g_w, cfg->g_h, 16); - I420Scale(img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y], - img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U], - img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V], - img->d_w, img->d_h, - stream->img->planes[VPX_PLANE_Y], - stream->img->stride[VPX_PLANE_Y], - stream->img->planes[VPX_PLANE_U], - stream->img->stride[VPX_PLANE_U], - stream->img->planes[VPX_PLANE_V], - stream->img->stride[VPX_PLANE_V], - stream->img->d_w, stream->img->d_h, - kFilterBox); + stream->img = + vpx_img_alloc(NULL, VPX_IMG_FMT_I420, cfg->g_w, cfg->g_h, 16); + I420Scale( + img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y], + img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U], + img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V], img->d_w, img->d_h, + stream->img->planes[VPX_PLANE_Y], stream->img->stride[VPX_PLANE_Y], + stream->img->planes[VPX_PLANE_U], stream->img->stride[VPX_PLANE_U], + stream->img->planes[VPX_PLANE_V], stream->img->stride[VPX_PLANE_V], + stream->img->d_w, stream->img->d_h, kFilterBox); img = stream->img; #else stream->encoder.err = 1; @@ -1619,15 +1602,14 @@ static void encode_frame(struct stream_state *stream, vpx_usec_timer_start(&timer); vpx_codec_encode(&stream->encoder, img, frame_start, - (unsigned long)(next_frame_start - frame_start), - 0, global->deadline); + (unsigned long)(next_frame_start - frame_start), 0, + global->deadline); vpx_usec_timer_mark(&timer); stream->cx_time += vpx_usec_timer_elapsed(&timer); ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame", stream->index); } - static void update_quantizer_histogram(struct stream_state *stream) { if (stream->config.cfg.g_pass != VPX_RC_FIRST_PASS) { int q; @@ -1638,10 +1620,8 @@ static void update_quantizer_histogram(struct stream_state *stream) { } } - static void get_cx_data(struct stream_state *stream, - struct VpxEncoderConfig *global, - int *got_data) { + struct VpxEncoderConfig *global, int *got_data) { const vpx_codec_cx_pkt_t *pkt; const struct vpx_codec_enc_cfg *cfg = &stream->config.cfg; vpx_codec_iter_t iter = NULL; @@ -1682,8 +1662,8 @@ static void get_cx_data(struct stream_state *stream, } } - (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, - stream->file); + (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, + stream->file); } stream->nbytes += pkt->data.raw.sz; @@ -1704,15 +1684,13 @@ static void get_cx_data(struct stream_state *stream, break; case VPX_CODEC_STATS_PKT: stream->frames_out++; - stats_write(&stream->stats, - pkt->data.twopass_stats.buf, + stats_write(&stream->stats, pkt->data.twopass_stats.buf, pkt->data.twopass_stats.sz); stream->nbytes += pkt->data.raw.sz; break; #if CONFIG_FP_MB_STATS case VPX_CODEC_FPMB_STATS_PKT: - 
stats_write(&stream->fpmb_stats, - pkt->data.firstpass_mb_stats.buf, + stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf, pkt->data.firstpass_mb_stats.sz); stream->nbytes += pkt->data.raw.sz; break; @@ -1733,19 +1711,16 @@ static void get_cx_data(struct stream_state *stream, } break; - default: - break; + default: break; } } } - -static void show_psnr(struct stream_state *stream, double peak) { +static void show_psnr(struct stream_state *stream, double peak) { int i; double ovpsnr; - if (!stream->psnr_count) - return; + if (!stream->psnr_count) return; fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index); ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, peak, @@ -1758,18 +1733,16 @@ static void show_psnr(struct stream_state *stream, double peak) { fprintf(stderr, "\n"); } - static float usec_to_fps(uint64_t usec, unsigned int frames) { return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0); } -static void test_decode(struct stream_state *stream, +static void test_decode(struct stream_state *stream, enum TestDecodeFatality fatal, const VpxInterface *codec) { vpx_image_t enc_img, dec_img; - if (stream->mismatch_seen) - return; + if (stream->mismatch_seen) return; /* Get the internal reference frame */ if (strcmp(codec->name, "vp8") == 0) { @@ -1831,10 +1804,8 @@ static void test_decode(struct stream_state *stream, " Y[%d, %d] {%d/%d}," " U[%d, %d] {%d/%d}," " V[%d, %d] {%d/%d}", - stream->index, stream->frames_out, - y[0], y[1], y[2], y[3], - u[0], u[1], u[2], u[3], - v[0], v[1], v[2], v[3]); + stream->index, stream->frames_out, y[0], y[1], y[2], + y[3], u[0], u[1], u[2], u[3], v[0], v[1], v[2], v[3]); stream->mismatch_seen = stream->frames_out; } @@ -1842,7 +1813,6 @@ static void test_decode(struct stream_state *stream, vpx_img_free(&dec_img); } - static void print_time(const char *label, int64_t etl) { int64_t hours; int64_t mins; @@ -1855,14 +1825,13 @@ static void print_time(const char *label, int64_t etl) { etl -= mins * 60; secs = etl; - fprintf(stderr, "[%3s %2"PRId64":%02"PRId64":%02"PRId64"] ", - label, hours, mins, secs); + fprintf(stderr, "[%3s %2" PRId64 ":%02" PRId64 ":%02" PRId64 "] ", label, + hours, mins, secs); } else { fprintf(stderr, "[%3s unknown] ", label); } } - int main(int argc, const char **argv_) { int pass; vpx_image_t raw; @@ -1885,8 +1854,7 @@ int main(int argc, const char **argv_) { memset(&input, 0, sizeof(input)); exec_name = argv_[0]; - if (argc < 3) - usage_exit(); + if (argc < 3) usage_exit(); /* Setup default input stream settings */ input.framerate.numerator = 30; @@ -1902,21 +1870,11 @@ int main(int argc, const char **argv_) { parse_global_config(&global, argv); switch (global.color_type) { - case I420: - input.fmt = VPX_IMG_FMT_I420; - break; - case I422: - input.fmt = VPX_IMG_FMT_I422; - break; - case I444: - input.fmt = VPX_IMG_FMT_I444; - break; - case I440: - input.fmt = VPX_IMG_FMT_I440; - break; - case YV12: - input.fmt = VPX_IMG_FMT_YV12; - break; + case I420: input.fmt = VPX_IMG_FMT_I420; break; + case I422: input.fmt = VPX_IMG_FMT_I422; break; + case I444: input.fmt = VPX_IMG_FMT_I444; break; + case I440: input.fmt = VPX_IMG_FMT_I440; break; + case YV12: input.fmt = VPX_IMG_FMT_YV12; break; } { @@ -1929,8 +1887,7 @@ int main(int argc, const char **argv_) { do { stream = new_stream(&global, stream); stream_cnt++; - if (!streams) - streams = stream; + if (!streams) streams = stream; } while (parse_stream_params(&global, stream, argv)); } @@ -1939,14 +1896,13 @@ int main(int argc, const char 
**argv_) { if (argi[0][0] == '-' && argi[0][1]) die("Error: Unrecognized option %s\n", *argi); - FOREACH_STREAM(check_encoder_config(global.disable_warning_prompt, - &global, &stream->config.cfg);); + FOREACH_STREAM(check_encoder_config(global.disable_warning_prompt, &global, + &stream->config.cfg);); /* Handle non-option arguments */ input.filename = argv[0]; - if (!input.filename) - usage_exit(); + if (!input.filename) usage_exit(); /* Decide if other chroma subsamplings than 4:2:0 are supported */ if (global.codec->fourcc == VP9_FOURCC || global.codec->fourcc == VP10_FOURCC) @@ -1975,8 +1931,9 @@ int main(int argc, const char **argv_) { /* Update stream configurations from the input file's parameters */ if (!input.width || !input.height) - fatal("Specify stream dimensions with --width (-w) " - " and --height (-h)"); + fatal( + "Specify stream dimensions with --width (-w) " + " and --height (-h)"); /* If input file does not specify bit-depth but input-bit-depth parameter * exists, assume that to be the input bit-depth. However, if the @@ -1993,9 +1950,8 @@ int main(int argc, const char **argv_) { }); if (input.bit_depth > 8) input.fmt |= VPX_IMG_FMT_HIGHBITDEPTH; } else { - FOREACH_STREAM({ - stream->config.cfg.g_input_bit_depth = input.bit_depth; - }); + FOREACH_STREAM( + { stream->config.cfg.g_input_bit_depth = input.bit_depth; }); } FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height)); @@ -2005,18 +1961,21 @@ int main(int argc, const char **argv_) { * --passes=2, ensure --fpf was set. */ if (global.pass && global.passes == 2) - FOREACH_STREAM( { - if (!stream->config.stats_fn) - die("Stream %d: Must specify --fpf when --pass=%d" - " and --passes=2\n", stream->index, global.pass); - }); + FOREACH_STREAM({ + if (!stream->config.stats_fn) + die( + "Stream %d: Must specify --fpf when --pass=%d" + " and --passes=2\n", + stream->index, global.pass); + }); #if !CONFIG_WEBM_IO FOREACH_STREAM({ if (stream->config.write_webm) { stream->config.write_webm = 0; - warn("vpxenc was compiled without WebM container support." - "Producing IVF output"); + warn( + "vpxenc was compiled without WebM container support." + "Producing IVF output"); } }); #endif @@ -2044,14 +2003,13 @@ int main(int argc, const char **argv_) { else vpx_img_alloc(&raw, input.fmt, input.width, input.height, 32); - FOREACH_STREAM(stream->rate_hist = - init_rate_histogram(&stream->config.cfg, - &global.framerate)); + FOREACH_STREAM(stream->rate_hist = init_rate_histogram( + &stream->config.cfg, &global.framerate)); } FOREACH_STREAM(setup_pass(stream, &global, pass)); - FOREACH_STREAM(open_output_file(stream, &global, - &input.pixel_aspect_ratio)); + FOREACH_STREAM( + open_output_file(stream, &global, &input.pixel_aspect_ratio)); FOREACH_STREAM(initialize_encoder(stream, &global)); #if CONFIG_VP9_HIGHBITDEPTH @@ -2068,7 +2026,7 @@ int main(int argc, const char **argv_) { input_shift = 0; } else { input_shift = (int)stream->config.cfg.g_bit_depth - - stream->config.cfg.g_input_bit_depth; + stream->config.cfg.g_input_bit_depth; } }); } @@ -2083,26 +2041,23 @@ int main(int argc, const char **argv_) { if (!global.limit || frames_in < global.limit) { frame_avail = read_frame(&input, &raw); - if (frame_avail) - frames_in++; - seen_frames = frames_in > global.skip_frames ? - frames_in - global.skip_frames : 0; + if (frame_avail) frames_in++; + seen_frames = + frames_in > global.skip_frames ? 
frames_in - global.skip_frames : 0; if (!global.quiet) { float fps = usec_to_fps(cx_time, seen_frames); fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes); if (stream_cnt == 1) - fprintf(stderr, - "frame %4d/%-4d %7"PRId64"B ", - frames_in, streams->frames_out, (int64_t)streams->nbytes); + fprintf(stderr, "frame %4d/%-4d %7" PRId64 "B ", frames_in, + streams->frames_out, (int64_t)streams->nbytes); else fprintf(stderr, "frame %4d ", frames_in); - fprintf(stderr, "%7"PRId64" %s %.2f %s ", + fprintf(stderr, "%7" PRId64 " %s %.2f %s ", cx_time > 9999999 ? cx_time / 1000 : cx_time, - cx_time > 9999999 ? "ms" : "us", - fps >= 1.0 ? fps : fps * 60, + cx_time > 9999999 ? "ms" : "us", fps >= 1.0 ? fps : fps * 60, fps >= 1.0 ? "fps" : "fpm"); print_time("ETA", estimated_time_left); } @@ -2133,8 +2088,7 @@ int main(int argc, const char **argv_) { FOREACH_STREAM({ if (stream->config.use_16bit_internal) encode_frame(stream, &global, - frame_avail ? frame_to_encode : NULL, - frames_in); + frame_avail ? frame_to_encode : NULL, frames_in); else assert(0); }); @@ -2146,8 +2100,7 @@ int main(int argc, const char **argv_) { } #else vpx_usec_timer_start(&timer); - FOREACH_STREAM(encode_frame(stream, &global, - frame_avail ? &raw : NULL, + FOREACH_STREAM(encode_frame(stream, &global, frame_avail ? &raw : NULL, frames_in)); #endif vpx_usec_timer_mark(&timer); @@ -2169,8 +2122,8 @@ int main(int argc, const char **argv_) { const int64_t frame_in_lagged = (seen_frames - lagged_count) * 1000; rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0; - remaining = 1000 * (global.limit - global.skip_frames - - seen_frames + lagged_count); + remaining = 1000 * (global.limit - global.skip_frames - + seen_frames + lagged_count); } else { const int64_t input_pos = ftello(input.file); const int64_t input_pos_lagged = input_pos - lagged_count; @@ -2180,9 +2133,8 @@ int main(int argc, const char **argv_) { remaining = limit - input_pos + lagged_count; } - average_rate = (average_rate <= 0) - ? rate - : (average_rate * 7 + rate) / 8; + average_rate = + (average_rate <= 0) ? rate : (average_rate * 7 + rate) / 8; estimated_time_left = average_rate ? remaining / average_rate : -1; } @@ -2191,23 +2143,23 @@ int main(int argc, const char **argv_) { } fflush(stdout); - if (!global.quiet) - fprintf(stderr, "\033[K"); + if (!global.quiet) fprintf(stderr, "\033[K"); } - if (stream_cnt > 1) - fprintf(stderr, "\n"); + if (stream_cnt > 1) fprintf(stderr, "\n"); if (!global.quiet) { - FOREACH_STREAM(fprintf(stderr, - "\rPass %d/%d frame %4d/%-4d %7"PRId64"B %7"PRId64"b/f %7"PRId64"b/s" - " %7"PRId64" %s (%.2f fps)\033[K\n", - pass + 1, - global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes, + FOREACH_STREAM(fprintf( + stderr, "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 + "b/f %7" PRId64 "b/s" + " %7" PRId64 " %s (%.2f fps)\033[K\n", + pass + 1, global.passes, frames_in, stream->frames_out, + (int64_t)stream->nbytes, seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0, - seen_frames ? (int64_t)stream->nbytes * 8 * - (int64_t)global.framerate.num / global.framerate.den / - seen_frames : 0, + seen_frames + ? (int64_t)stream->nbytes * 8 * (int64_t)global.framerate.num / + global.framerate.den / seen_frames + : 0, stream->cx_time > 9999999 ? stream->cx_time / 1000 : stream->cx_time, stream->cx_time > 9999999 ? 
"ms" : "us", usec_to_fps(stream->cx_time, seen_frames))); @@ -2242,17 +2194,15 @@ int main(int argc, const char **argv_) { FOREACH_STREAM(stats_close(&stream->fpmb_stats, global.passes - 1)); #endif - if (global.pass) - break; + if (global.pass) break; } if (global.show_q_hist_buckets) - FOREACH_STREAM(show_q_histogram(stream->counts, - global.show_q_hist_buckets)); + FOREACH_STREAM( + show_q_histogram(stream->counts, global.show_q_hist_buckets)); if (global.show_rate_hist_buckets) - FOREACH_STREAM(show_rate_histogram(stream->rate_hist, - &stream->config.cfg, + FOREACH_STREAM(show_rate_histogram(stream->rate_hist, &stream->config.cfg, global.show_rate_hist_buckets)); FOREACH_STREAM(destroy_rate_histogram(stream->rate_hist)); @@ -2274,8 +2224,7 @@ int main(int argc, const char **argv_) { #endif #if CONFIG_VP9_HIGHBITDEPTH - if (allocated_raw_shift) - vpx_img_free(&raw_shift); + if (allocated_raw_shift) vpx_img_free(&raw_shift); #endif vpx_img_free(&raw); free(argv); diff --git a/vpxstats.c b/vpxstats.c index 16728ce09637d33614457ae20b847133ca3ab8db..142e367bb48c440fc7873722cdff8cbfdaaca8d5 100644 --- a/vpxstats.c +++ b/vpxstats.c @@ -30,8 +30,7 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) { stats->file = fopen(fpf, "rb"); - if (stats->file == NULL) - fatal("First-pass stats file does not exist!"); + if (stats->file == NULL) fatal("First-pass stats file does not exist!"); if (fseek(stats->file, 0, SEEK_END)) fatal("First-pass stats file must be seekable!"); @@ -76,18 +75,17 @@ void stats_close(stats_io_t *stats, int last_pass) { fclose(stats->file); stats->file = NULL; } else { - if (stats->pass == last_pass) - free(stats->buf.buf); + if (stats->pass == last_pass) free(stats->buf.buf); } } void stats_write(stats_io_t *stats, const void *pkt, size_t len) { if (stats->file) { - (void) fwrite(pkt, 1, len, stats->file); + (void)fwrite(pkt, 1, len, stats->file); } else { if (stats->buf.sz + len > stats->buf_alloc_sz) { - size_t new_sz = stats->buf_alloc_sz + 64 * 1024; - char *new_ptr = realloc(stats->buf.buf, new_sz); + size_t new_sz = stats->buf_alloc_sz + 64 * 1024; + char *new_ptr = realloc(stats->buf.buf, new_sz); if (new_ptr) { stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf); @@ -104,6 +102,4 @@ void stats_write(stats_io_t *stats, const void *pkt, size_t len) { } } -vpx_fixed_buf_t stats_get(stats_io_t *stats) { - return stats->buf; -} +vpx_fixed_buf_t stats_get(stats_io_t *stats) { return stats->buf; } diff --git a/warnings.c b/warnings.c index 7ac678ab4aabfee4a94ce7916f0580f1205c5b25..a3e4926674b00eb434a478835dbaac2219034766 100644 --- a/warnings.c +++ b/warnings.c @@ -47,8 +47,7 @@ static void add_warning(const char *warning_string, new_node->warning_string = warning_string; new_node->next_warning = NULL; - while (*node != NULL) - node = &(*node)->next_warning; + while (*node != NULL) node = &(*node)->next_warning; *node = new_node; } @@ -78,9 +77,7 @@ static void check_quantizer(int min_q, int max_q, } static void check_lag_in_frames_realtime_deadline( - int lag_in_frames, - int deadline, - struct WarningList *warning_list) { + int lag_in_frames, int deadline, struct WarningList *warning_list) { if (deadline == VPX_DL_REALTIME && lag_in_frames != 0) add_warning(lag_in_frames_with_realtime, warning_list); } @@ -90,26 +87,21 @@ void check_encoder_config(int disable_prompt, const struct vpx_codec_enc_cfg *stream_config) { int num_warnings = 0; struct WarningListNode *warning = NULL; - struct WarningList warning_list = {0}; + struct WarningList 
warning_list = { 0 }; check_quantizer(stream_config->rc_min_quantizer, - stream_config->rc_max_quantizer, - &warning_list); + stream_config->rc_max_quantizer, &warning_list); check_lag_in_frames_realtime_deadline(stream_config->g_lag_in_frames, - global_config->deadline, - &warning_list); + global_config->deadline, &warning_list); /* Count and print warnings. */ - for (warning = warning_list.warning_node; - warning != NULL; - warning = warning->next_warning, - ++num_warnings) { + for (warning = warning_list.warning_node; warning != NULL; + warning = warning->next_warning, ++num_warnings) { warn(warning->warning_string); } free_warning_list(&warning_list); if (num_warnings) { - if (!disable_prompt && !continue_prompt(num_warnings)) - exit(EXIT_FAILURE); + if (!disable_prompt && !continue_prompt(num_warnings)) exit(EXIT_FAILURE); } } diff --git a/webmdec.h b/webmdec.h index aa371f32122d88099186979a89f9139cea5d9de9..7dcb170caf3c0f1d394bb5e24764e619deb5edc6 100644 --- a/webmdec.h +++ b/webmdec.h @@ -52,8 +52,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx, // 0 - Success // 1 - End of Stream // -1 - Error -int webm_read_frame(struct WebmInputContext *webm_ctx, - uint8_t **buffer, +int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, size_t *buffer_size); // Guesses the frame rate of the input file based on the container timestamps. diff --git a/webmenc.h b/webmenc.h index ad30664e31a8b6228102e2ad132211e2a203370d..1ae7786cd773ca9cce942fc1a69635e1fce85607 100644 --- a/webmenc.h +++ b/webmenc.h @@ -40,8 +40,7 @@ typedef enum stereo_format { void write_webm_file_header(struct WebmOutputContext *webm_ctx, const vpx_codec_enc_cfg_t *cfg, const struct vpx_rational *fps, - stereo_format_t stereo_fmt, - unsigned int fourcc, + stereo_format_t stereo_fmt, unsigned int fourcc, const struct VpxRational *par); void write_webm_block(struct WebmOutputContext *webm_ctx, diff --git a/y4menc.c b/y4menc.c index b647e8dcc5e891a3e57ea637be4dd40102e081c4..e26fcaf6ea30885dbdc01a5c9902817181473d95 100644 --- a/y4menc.c +++ b/y4menc.c @@ -17,39 +17,43 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height, const char *color; switch (bit_depth) { case 8: - color = fmt == VPX_IMG_FMT_444A ? "C444alpha\n" : - fmt == VPX_IMG_FMT_I444 ? "C444\n" : - fmt == VPX_IMG_FMT_I422 ? "C422\n" : - "C420jpeg\n"; + color = fmt == VPX_IMG_FMT_444A + ? "C444alpha\n" + : fmt == VPX_IMG_FMT_I444 ? "C444\n" : fmt == VPX_IMG_FMT_I422 + ? "C422\n" + : "C420jpeg\n"; break; case 9: - color = fmt == VPX_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9\n" : - fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" : - "C420p9 XYSCSS=420P9\n"; + color = fmt == VPX_IMG_FMT_I44416 + ? "C444p9 XYSCSS=444P9\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" + : "C420p9 XYSCSS=420P9\n"; break; case 10: - color = fmt == VPX_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10\n" : - fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" : - "C420p10 XYSCSS=420P10\n"; + color = fmt == VPX_IMG_FMT_I44416 + ? "C444p10 XYSCSS=444P10\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" + : "C420p10 XYSCSS=420P10\n"; break; case 12: - color = fmt == VPX_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12\n" : - fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" : - "C420p12 XYSCSS=420P12\n"; + color = fmt == VPX_IMG_FMT_I44416 + ? "C444p12 XYSCSS=444P12\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" + : "C420p12 XYSCSS=420P12\n"; break; case 14: - color = fmt == VPX_IMG_FMT_I44416 ? 
"C444p14 XYSCSS=444P14\n" : - fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" : - "C420p14 XYSCSS=420P14\n"; + color = fmt == VPX_IMG_FMT_I44416 + ? "C444p14 XYSCSS=444P14\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" + : "C420p14 XYSCSS=420P14\n"; break; case 16: - color = fmt == VPX_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16\n" : - fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" : - "C420p16 XYSCSS=420P16\n"; + color = fmt == VPX_IMG_FMT_I44416 + ? "C444p16 XYSCSS=444P16\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" + : "C420p16 XYSCSS=420P16\n"; break; - default: - color = NULL; - assert(0); + default: color = NULL; assert(0); } return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s", width, height, framerate->numerator, framerate->denominator, 'p', color); diff --git a/y4minput.c b/y4minput.c index 2dbd603e8c1ae8c0178cb9ab6499f92c1884c7d7..7de859f7ac1cc09fdfffbba66e334b734e545ec2 100644 --- a/y4minput.c +++ b/y4minput.c @@ -25,7 +25,7 @@ static int file_read(void *buf, size_t size, FILE *file) { int file_error; size_t len = 0; do { - const size_t n = fread((uint8_t*)buf + len, 1, size - len, file); + const size_t n = fread((uint8_t *)buf + len, 1, size - len, file); len += n; file_error = ferror(file); if (file_error) { @@ -41,21 +41,22 @@ static int file_read(void *buf, size_t size, FILE *file) { } while (!feof(file) && len < size && ++retry_count < kMaxRetries); if (!feof(file) && len != size) { - fprintf(stderr, "Error reading file: %u of %u bytes read," - " error: %d, retries: %d, %d: %s\n", - (uint32_t)len, (uint32_t)size, file_error, retry_count, - errno, strerror(errno)); + fprintf(stderr, + "Error reading file: %u of %u bytes read," + " error: %d, retries: %d, %d: %s\n", + (uint32_t)len, (uint32_t)size, file_error, retry_count, errno, + strerror(errno)); } return len == size; } static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { - int got_w; - int got_h; - int got_fps; - int got_interlace; - int got_par; - int got_chroma; + int got_w; + int got_h; + int got_fps; + int got_interlace; + int got_par; + int got_chroma; char *p; char *q; got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0; @@ -70,55 +71,47 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { /*Process the tag.*/ switch (p[0]) { case 'W': { - if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1)return -1; + if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1; got_w = 1; - } - break; + } break; case 'H': { - if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1)return -1; + if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1; got_h = 1; - } - break; + } break; case 'F': { if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) { return -1; } got_fps = 1; - } - break; + } break; case 'I': { _y4m->interlace = p[1]; got_interlace = 1; - } - break; + } break; case 'A': { if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) { return -1; } got_par = 1; - } - break; + } break; case 'C': { - if (q - p > 16)return -1; + if (q - p > 16) return -1; memcpy(_y4m->chroma_type, p + 1, q - p - 1); _y4m->chroma_type[q - p - 1] = '\0'; got_chroma = 1; - } - break; - /*Ignore unknown tags.*/ + } break; + /*Ignore unknown tags.*/ } } - if (!got_w || !got_h || !got_fps)return -1; - if (!got_interlace)_y4m->interlace = '?'; - if (!got_par)_y4m->par_n = _y4m->par_d = 0; + if (!got_w || !got_h || !got_fps) return -1; + if (!got_interlace) _y4m->interlace = '?'; + if (!got_par) _y4m->par_n = _y4m->par_d = 0; /*Chroma-type is not specified in older files, e.g., those generated 
by mplayer.*/ - if (!got_chroma)strcpy(_y4m->chroma_type, "420"); + if (!got_chroma) strcpy(_y4m->chroma_type, "420"); return 0; } - - /*All anti-aliasing filters in the following conversion functions are based on one of two window functions: The 6-tap Lanczos window (for down-sampling and shifts): @@ -141,9 +134,9 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { have these steps pipelined, for less memory consumption and better cache performance, but we do them separately for simplicity.*/ -#define OC_MINI(_a,_b) ((_a)>(_b)?(_b):(_a)) -#define OC_MAXI(_a,_b) ((_a)<(_b)?(_b):(_a)) -#define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c))) +#define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) +#define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a)) +#define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c))) /*420jpeg chroma samples are sited like: Y-------Y-------Y-------Y------- @@ -187,25 +180,36 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { lines, and they are vertically co-sited with the luma samples in both the mpeg2 and jpeg cases (thus requiring no vertical resampling).*/ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, - const unsigned char *_src, int _c_w, int _c_h) { + const unsigned char *_src, int _c_w, + int _c_h) { int y; int x; for (y = 0; y < _c_h; y++) { /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos window.*/ for (x = 0; x < OC_MINI(_c_w, 2); x++) { - _dst[x] = (unsigned char)OC_CLAMPI(0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + - 114 * _src[x] + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + - _src[OC_MINI(x + 3, _c_w - 1)] + 64) >> 7, 255); + _dst[x] = (unsigned char)OC_CLAMPI( + 0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + + _src[OC_MINI(x + 3, _c_w - 1)] + 64) >> + 7, + 255); } for (; x < _c_w - 3; x++) { - _dst[x] = (unsigned char)OC_CLAMPI(0, (4 * _src[x - 2] - 17 * _src[x - 1] + - 114 * _src[x] + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> 7, 255); + _dst[x] = (unsigned char)OC_CLAMPI( + 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> + 7, + 255); } for (; x < _c_w; x++) { - _dst[x] = (unsigned char)OC_CLAMPI(0, (4 * _src[x - 2] - 17 * _src[x - 1] + - 114 * _src[x] + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + - _src[_c_w - 1] + 64) >> 7, 255); + _dst[x] = (unsigned char)OC_CLAMPI( + 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> + 7, + 255); } _dst += _c_w; _src += _c_w; @@ -278,12 +282,12 @@ static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst, static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; - int c_w; - int c_h; - int c_sz; - int pli; - int y; - int x; + int c_w; + int c_h; + int c_sz; + int pli; + int y; + int x; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ @@ -303,53 +307,73 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, This is the same filter used above, but in the other order.*/ for (x = 0; x < c_w; x++) { for (y = 0; y < OC_MINI(c_h, 3); y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (tmp[0] - - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + 35 * tmp[OC_MAXI(y - 1, 0) 
* c_w] - + 114 * tmp[y * c_w] - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + + 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> + 7, + 255); } for (; y < c_h - 2; y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (tmp[(y - 3) * c_w] - - 9 * tmp[(y - 2) * c_w] + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> + 7, + 255); } for (; y < c_h; y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (tmp[(y - 3) * c_w] - - 9 * tmp[(y - 2) * c_w] + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + 4 * tmp[(c_h - 1) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[(c_h - 1) * c_w] + 64) >> + 7, + 255); } _dst++; tmp++; } _dst += c_sz - c_w; tmp -= c_w; - } - break; + } break; case 2: { /*Slide C_r down a quarter-pel. This is the same as the horizontal filter.*/ for (x = 0; x < c_w; x++) { for (y = 0; y < OC_MINI(c_h, 2); y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (4 * tmp[0] - - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] - + tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[0] - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + + tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >> + 7, + 255); } for (; y < c_h - 3; y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (4 * tmp[(y - 2) * c_w] - - 17 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - - 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - + 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> + 7, + 255); } for (; y < c_h; y++) { - _dst[y * c_w] = (unsigned char)OC_CLAMPI(0, (4 * tmp[(y - 2) * c_w] - - 17 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - - 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] + 64) >> 7, 255); + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] + + 64) >> + 7, + 255); } _dst++; tmp++; } - } - break; + } break; } /*For actual interlaced material, this would have to be done separately on each field, and the shift amounts would be different. @@ -364,27 +388,37 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, /*Perform vertical filtering to reduce a single plane from 4:2:2 to 4:2:0. 
This is used as a helper by several converation routines.*/ static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst, - const unsigned char *_src, int _c_w, int _c_h) { + const unsigned char *_src, int _c_w, + int _c_h) { int y; int x; /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ for (x = 0; x < _c_w; x++) { for (y = 0; y < OC_MINI(_c_h, 2); y += 2) { - _dst[(y >> 1)*_c_w] = OC_CLAMPI(0, (64 * _src[0] - + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - - 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] - + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> 7, 255); + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - + 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> + 7, + 255); } for (; y < _c_h - 3; y += 2) { - _dst[(y >> 1)*_c_w] = OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - - 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) - + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> 7, 255); + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> + 7, + 255); } for (; y < _c_h; y += 2) { - _dst[(y >> 1)*_c_w] = OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] - + _src[(_c_h - 1) * _c_w]) - 17 * (_src[(y - 1) * _c_w] - + _src[OC_MINI(y + 2, _c_h - 1) * _c_w]) - + 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) + 64) >> 7, 255); + _dst[(y >> 1) * _c_w] = OC_CLAMPI( + 0, + (3 * (_src[(y - 2) * _c_w] + _src[(_c_h - 1) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[OC_MINI(y + 2, _c_h - 1) * _c_w]) + + 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) + + 64) >> + 7, + 255); } _src++; _dst++; @@ -497,12 +531,12 @@ static void y4m_convert_422jpeg_420jpeg(y4m_input *_y4m, unsigned char *_dst, static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; - int c_w; - int c_h; - int c_sz; - int dst_c_h; - int dst_c_sz; - int pli; + int c_w; + int c_h; + int c_sz; + int dst_c_h; + int dst_c_sz; + int pli; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ @@ -569,16 +603,16 @@ static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst, static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; - int c_w; - int c_h; - int c_sz; - int dst_c_w; - int dst_c_h; - int dst_c_sz; - int tmp_sz; - int pli; - int y; - int x; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ @@ -599,23 +633,42 @@ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a 4-tap Mitchell window.*/ for (x = 0; x < OC_MINI(c_w, 1); x++) { - tmp[x << 1] = (unsigned char)OC_CLAMPI(0, (111 * _aux[0] - + 18 * _aux[OC_MINI(1, c_w - 1)] - _aux[OC_MINI(2, c_w - 1)] + 64) >> 7, 255); - tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(0, (47 * _aux[0] - + 86 * _aux[OC_MINI(1, c_w - 1)] - 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> 7, 255); + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - + _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned 
char)OC_CLAMPI( + 0, (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - + 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); } for (; x < c_w - 2; x++) { - tmp[x << 1] = (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] - + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> 7, 255); - tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(0, (-3 * _aux[x - 1] + 50 * _aux[x] - + 86 * _aux[x + 1] - 5 * _aux[x + 2] + 64) >> 7, 255); + tmp[x << 1] = + (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] + + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - + 5 * _aux[x + 2] + 64) >> + 7, + 255); } for (; x < c_w; x++) { - tmp[x << 1] = (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] - + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >> 7, 255); + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, (_aux[x - 1] + 110 * _aux[x] + + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >> + 7, + 255); if ((x << 1 | 1) < dst_c_w) { - tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(0, (-3 * _aux[x - 1] + 50 * _aux[x] - + 86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >> 7, 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + + 86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >> + 7, + 255); } } tmp += dst_c_w; @@ -632,16 +685,16 @@ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, unsigned char *_aux) { unsigned char *tmp; - int c_w; - int c_h; - int c_sz; - int dst_c_w; - int dst_c_h; - int dst_c_sz; - int tmp_sz; - int pli; - int y; - int x; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; /*Skip past the luma data.*/ _dst += _y4m->pic_w * _y4m->pic_h; /*Compute the size of each chroma plane.*/ @@ -657,18 +710,27 @@ static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ for (y = 0; y < c_h; y++) { for (x = 0; x < OC_MINI(c_w, 2); x += 2) { - tmp[x >> 1] = OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - - 17 * _aux[OC_MINI(2, c_w - 1)] - + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> 7, 255); + tmp[x >> 1] = + OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - + 17 * _aux[OC_MINI(2, c_w - 1)] + + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> + 7, + 255); } for (; x < c_w - 3; x += 2) { - tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) - - 17 * (_aux[x - 1] + _aux[x + 2]) + 78 * (_aux[x] + _aux[x + 1]) + 64) >> 7, 255); + tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) - + 17 * (_aux[x - 1] + _aux[x + 2]) + + 78 * (_aux[x] + _aux[x + 1]) + 64) >> + 7, + 255); } for (; x < c_w; x += 2) { - tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[c_w - 1]) - - 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + - 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> 7, 255); + tmp[x >> 1] = OC_CLAMPI( + 0, (3 * (_aux[x - 2] + _aux[c_w - 1]) - + 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + + 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> + 7, + 255); } tmp += dst_c_w; _aux += c_w; @@ -701,9 +763,9 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, int only_420) { - char buffer[80] = {0}; - int ret; - int i; + char buffer[80] = { 0 
}; + int ret; + int i; /*Read until newline, or 80 cols, whichever happens first.*/ for (i = 0; i < 79; i++) { if (_nskip > 0) { @@ -712,10 +774,10 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, } else { if (!file_read(buffer + i, 1, _fin)) return -1; } - if (buffer[i] == '\n')break; + if (buffer[i] == '\n') break; } /*We skipped too much header data.*/ - if (_nskip > 0)return -1; + if (_nskip > 0) return -1; if (i == 79) { fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n"); return -1; @@ -734,10 +796,12 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, return ret; } if (_y4m->interlace == '?') { - fprintf(stderr, "Warning: Input video interlacing format unknown; " + fprintf(stderr, + "Warning: Input video interlacing format unknown; " "assuming progressive scan.\n"); } else if (_y4m->interlace != 'p') { - fprintf(stderr, "Input video is interlaced; " + fprintf(stderr, + "Input video is interlaced; " "Only progressive scan handled.\n"); return -1; } @@ -746,9 +810,11 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, _y4m->bit_depth = 8; if (strcmp(_y4m->chroma_type, "420") == 0 || strcmp(_y4m->chroma_type, "420jpeg") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h - + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = + _y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); /* Natively supported: no conversion required. */ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; _y4m->convert = y4m_convert_null; @@ -757,9 +823,9 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, _y4m->dst_c_dec_h = 2; _y4m->src_c_dec_v = 2; _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * - ((_y4m->pic_h + 1) / 2)); + _y4m->dst_buf_read_sz = + 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); /* Natively supported: no conversion required. */ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; _y4m->convert = y4m_convert_null; @@ -775,9 +841,9 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, _y4m->dst_c_dec_h = 2; _y4m->src_c_dec_v = 2; _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * - ((_y4m->pic_h + 1) / 2)); + _y4m->dst_buf_read_sz = + 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); /* Natively supported: no conversion required. 
*/ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; _y4m->convert = y4m_convert_null; @@ -789,20 +855,23 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, return -1; } } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2; + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; /*Chroma filter required: read into the aux buf first.*/ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); _y4m->convert = y4m_convert_42xmpeg2_42xjpeg; } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2; + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); - _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + _y4m->aux_buf_read_sz = + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); _y4m->convert = y4m_convert_42xpaldv_42xjpeg; } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) { _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2; @@ -810,7 +879,8 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, _y4m->dst_c_dec_v = 2; _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; /*Chroma filter required: read into the aux buf first.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; _y4m->convert = y4m_convert_422jpeg_420jpeg; } else if (strcmp(_y4m->chroma_type, "422") == 0) { _y4m->src_c_dec_h = 2; @@ -823,16 +893,16 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, We need to make two filter passes, so we need some extra space in the aux buffer.*/ _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + - ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; _y4m->convert = y4m_convert_422_420jpeg; } else { _y4m->vpx_fmt = VPX_IMG_FMT_I422; _y4m->bps = 16; _y4m->dst_c_dec_h = _y4m->src_c_dec_h; _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h - + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->dst_buf_read_sz = + _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; /*Natively supported: no conversion required.*/ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; _y4m->convert = y4m_convert_null; @@ -879,7 +949,8 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, We need to make two filter passes, so we need some extra space in the aux buffer.*/ _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; _y4m->convert = y4m_convert_411_420jpeg; } else if (strcmp(_y4m->chroma_type, "444") == 0) { _y4m->src_c_dec_h = 1; @@ -892,8 +963,8 @@ int 
y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, We need to make two filter passes, so we need some extra space in the aux buffer.*/ _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + - ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; _y4m->convert = y4m_convert_444_420jpeg; } else { _y4m->vpx_fmt = VPX_IMG_FMT_I444; @@ -972,9 +1043,10 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, } /*The size of the final frame buffers is always computed from the destination chroma decimation type.*/ - _y4m->dst_buf_sz = _y4m->pic_w * _y4m->pic_h - + 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * - ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); + _y4m->dst_buf_sz = + _y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * + ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); if (_y4m->bit_depth == 8) _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz); else @@ -992,11 +1064,11 @@ void y4m_input_close(y4m_input *_y4m) { int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) { char frame[6]; - int pic_sz; - int c_w; - int c_h; - int c_sz; - int bytes_per_sample = _y4m->bit_depth > 8 ? 2 : 1; + int pic_sz; + int c_w; + int c_h; + int c_sz; + int bytes_per_sample = _y4m->bit_depth > 8 ? 2 : 1; /*Read and skip the frame header.*/ if (!file_read(frame, 6, _fin)) return 0; if (memcmp(frame, "FRAME", 5)) { @@ -1005,8 +1077,9 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) { } if (frame[5] != '\n') { char c; - int j; - for (j = 0; j < 79 && file_read(&c, 1, _fin) && c != '\n'; j++) {} + int j; + for (j = 0; j < 79 && file_read(&c, 1, _fin) && c != '\n'; j++) { + } if (j == 79) { fprintf(stderr, "Error parsing Y4M frame header\n"); return -1; diff --git a/y4minput.h b/y4minput.h index 356cebbcf0aadabe4b86f5e7f645251aa1fa7479..9e69ceb835a8861620e7b822d415225c9538b3c9 100644 --- a/y4minput.h +++ b/y4minput.h @@ -14,52 +14,46 @@ #ifndef Y4MINPUT_H_ #define Y4MINPUT_H_ -# include <stdio.h> -# include "vpx/vpx_image.h" +#include <stdio.h> +#include "vpx/vpx_image.h" #ifdef __cplusplus extern "C" { #endif - - typedef struct y4m_input y4m_input; - - /*The function used to perform chroma conversion.*/ -typedef void (*y4m_convert_func)(y4m_input *_y4m, - unsigned char *_dst, unsigned char *_src); - - +typedef void (*y4m_convert_func)(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_src); struct y4m_input { - int pic_w; - int pic_h; - int fps_n; - int fps_d; - int par_n; - int par_d; - char interlace; - int src_c_dec_h; - int src_c_dec_v; - int dst_c_dec_h; - int dst_c_dec_v; - char chroma_type[16]; + int pic_w; + int pic_h; + int fps_n; + int fps_d; + int par_n; + int par_d; + char interlace; + int src_c_dec_h; + int src_c_dec_v; + int dst_c_dec_h; + int dst_c_dec_v; + char chroma_type[16]; /*The size of each converted frame buffer.*/ - size_t dst_buf_sz; + size_t dst_buf_sz; /*The amount to read directly into the converted frame buffer.*/ - size_t dst_buf_read_sz; + size_t dst_buf_read_sz; /*The size of the auxilliary buffer.*/ - size_t aux_buf_sz; + size_t aux_buf_sz; /*The amount to read into the auxilliary buffer.*/ - size_t aux_buf_read_sz; - y4m_convert_func convert; - unsigned char *dst_buf; - unsigned char *aux_buf; - enum vpx_img_fmt vpx_fmt; - int bps; - unsigned int bit_depth; + size_t 
aux_buf_read_sz; + y4m_convert_func convert; + unsigned char *dst_buf; + unsigned char *aux_buf; + enum vpx_img_fmt vpx_fmt; + int bps; + unsigned int bit_depth; }; int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,